summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c16
-rw-r--r--fs/9p/fid.c26
-rw-r--r--fs/9p/fid.h13
-rw-r--r--fs/9p/vfs_addr.c22
-rw-r--r--fs/9p/vfs_dir.c4
-rw-r--r--fs/9p/vfs_file.c10
-rw-r--r--fs/9p/vfs_inode.c16
-rw-r--r--fs/9p/vfs_inode_dotl.c20
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/9p/xattr.c13
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/Kconfig.binfmt11
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/dir.c8
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/dir.c2
-rw-r--r--fs/affs/file.c31
-rw-r--r--fs/affs/namei.c20
-rw-r--r--fs/affs/super.c5
-rw-r--r--fs/afs/dir.c18
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/afs/rxrpc.c64
-rw-r--r--fs/afs/super.c4
-rw-r--r--fs/afs/write.c30
-rw-r--r--fs/aio.c9
-rw-r--r--fs/attr.c19
-rw-r--r--fs/autofs4/autofs_i.h8
-rw-r--r--fs/autofs4/expire.c27
-rw-r--r--fs/autofs4/root.c14
-rw-r--r--fs/autofs4/waitq.c13
-rw-r--r--fs/bad_inode.c8
-rw-r--r--fs/befs/befs.h4
-rw-r--r--fs/befs/btree.c16
-rw-r--r--fs/befs/btree.h4
-rw-r--r--fs/befs/datastream.c34
-rw-r--r--fs/befs/datastream.h11
-rw-r--r--fs/befs/io.c4
-rw-r--r--fs/befs/linuxvfs.c14
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/binfmt_aout.c26
-rw-r--r--fs/binfmt_elf.c54
-rw-r--r--fs/binfmt_elf_fdpic.c42
-rw-r--r--fs/binfmt_em86.c3
-rw-r--r--fs/binfmt_flat.c531
-rw-r--r--fs/binfmt_misc.c12
-rw-r--r--fs/block_dev.c206
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c31
-rw-r--r--fs/btrfs/async-thread.h6
-rw-r--r--fs/btrfs/backref.c8
-rw-r--r--fs/btrfs/btrfs_inode.h12
-rw-r--r--fs/btrfs/check-integrity.c129
-rw-r--r--fs/btrfs/check-integrity.h6
-rw-r--r--fs/btrfs/compression.c196
-rw-r--r--fs/btrfs/ctree.c142
-rw-r--r--fs/btrfs/ctree.h1231
-rw-r--r--fs/btrfs/dedupe.h24
-rw-r--r--fs/btrfs/delayed-inode.c101
-rw-r--r--fs/btrfs/delayed-inode.h10
-rw-r--r--fs/btrfs/delayed-ref.c17
-rw-r--r--fs/btrfs/delayed-ref.h2
-rw-r--r--fs/btrfs/dev-replace.c109
-rw-r--r--fs/btrfs/dev-replace.h4
-rw-r--r--fs/btrfs/disk-io.c361
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/extent-tree.c1128
-rw-r--r--fs/btrfs/extent_io.c606
-rw-r--r--fs/btrfs/extent_io.h53
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/file-item.c10
-rw-r--r--fs/btrfs/file.c160
-rw-r--r--fs/btrfs/free-space-cache.c54
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/free-space-tree.c16
-rw-r--r--fs/btrfs/hash.c5
-rw-r--r--fs/btrfs/hash.h1
-rw-r--r--fs/btrfs/inode-item.c2
-rw-r--r--fs/btrfs/inode-map.c26
-rw-r--r--fs/btrfs/inode.c939
-rw-r--r--fs/btrfs/ioctl.c340
-rw-r--r--fs/btrfs/lzo.c32
-rw-r--r--fs/btrfs/ordered-data.c35
-rw-r--r--fs/btrfs/ordered-data.h8
-rw-r--r--fs/btrfs/props.c6
-rw-r--r--fs/btrfs/qgroup.c98
-rw-r--r--fs/btrfs/qgroup.h9
-rw-r--r--fs/btrfs/raid56.c51
-rw-r--r--fs/btrfs/reada.c32
-rw-r--r--fs/btrfs/relocation.c112
-rw-r--r--fs/btrfs/root-tree.c18
-rw-r--r--fs/btrfs/scrub.c137
-rw-r--r--fs/btrfs/send.c84
-rw-r--r--fs/btrfs/struct-funcs.c6
-rw-r--r--fs/btrfs/super.c279
-rw-r--r--fs/btrfs/sysfs.c16
-rw-r--r--fs/btrfs/tests/btrfs-tests.c75
-rw-r--r--fs/btrfs/tests/btrfs-tests.h47
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c34
-rw-r--r--fs/btrfs/tests/extent-io-tests.c124
-rw-r--r--fs/btrfs/tests/free-space-tests.c95
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c46
-rw-r--r--fs/btrfs/tests/inode-tests.c386
-rw-r--r--fs/btrfs/tests/qgroup-tests.c130
-rw-r--r--fs/btrfs/transaction.c175
-rw-r--r--fs/btrfs/transaction.h5
-rw-r--r--fs/btrfs/tree-log.c253
-rw-r--r--fs/btrfs/ulist.c2
-rw-r--r--fs/btrfs/volumes.c848
-rw-r--r--fs/btrfs/volumes.h63
-rw-r--r--fs/btrfs/xattr.c40
-rw-r--r--fs/btrfs/xattr.h3
-rw-r--r--fs/btrfs/zlib.c38
-rw-r--r--fs/buffer.c266
-rw-r--r--fs/cachefiles/interface.c2
-rw-r--r--fs/cachefiles/namei.c5
-rw-r--r--fs/cachefiles/proc.c1
-rw-r--r--fs/cachefiles/rdwr.c38
-rw-r--r--fs/ceph/acl.c16
-rw-r--r--fs/ceph/addr.c412
-rw-r--r--fs/ceph/cache.c145
-rw-r--r--fs/ceph/cache.h44
-rw-r--r--fs/ceph/caps.c949
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/dir.c456
-rw-r--r--fs/ceph/export.c10
-rw-r--r--fs/ceph/file.c236
-rw-r--r--fs/ceph/inode.c240
-rw-r--r--fs/ceph/ioctl.c44
-rw-r--r--fs/ceph/mds_client.c508
-rw-r--r--fs/ceph/mds_client.h38
-rw-r--r--fs/ceph/mdsmap.c43
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/super.c82
-rw-r--r--fs/ceph/super.h70
-rw-r--r--fs/ceph/xattr.c319
-rw-r--r--fs/char_dev.c4
-rw-r--r--fs/cifs/Makefile3
-rw-r--r--fs/cifs/cifs_debug.c7
-rw-r--r--fs/cifs/cifs_dfs_ref.c8
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifs_spnego.c67
-rw-r--r--fs/cifs/cifs_unicode.c33
-rw-r--r--fs/cifs/cifs_unicode.h2
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsencrypt.c113
-rw-r--r--fs/cifs/cifsfs.c58
-rw-r--r--fs/cifs/cifsfs.h14
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifsproto.h14
-rw-r--r--fs/cifs/cifssmb.c31
-rw-r--r--fs/cifs/connect.c198
-rw-r--r--fs/cifs/dir.c52
-rw-r--r--fs/cifs/file.c174
-rw-r--r--fs/cifs/inode.c35
-rw-r--r--fs/cifs/ntlmssp.h2
-rw-r--r--fs/cifs/readdir.c61
-rw-r--r--fs/cifs/sess.c207
-rw-r--r--fs/cifs/smb2glob.h1
-rw-r--r--fs/cifs/smb2inode.c8
-rw-r--r--fs/cifs/smb2ops.c32
-rw-r--r--fs/cifs/smb2pdu.c53
-rw-r--r--fs/cifs/smb2proto.h2
-rw-r--r--fs/cifs/smb2transport.c107
-rw-r--r--fs/cifs/transport.c141
-rw-r--r--fs/cifs/xattr.c387
-rw-r--r--fs/coda/dir.c18
-rw-r--r--fs/coda/pioctl.c1
-rw-r--r--fs/compat.c16
-rw-r--r--fs/compat_ioctl.c12
-rw-r--r--fs/configfs/dir.c37
-rw-r--r--fs/configfs/file.c8
-rw-r--r--fs/configfs/inode.c2
-rw-r--r--fs/configfs/mount.c4
-rw-r--r--fs/coredump.c11
-rw-r--r--fs/cramfs/README26
-rw-r--r--fs/cramfs/inode.c34
-rw-r--r--fs/crypto/crypto.c62
-rw-r--r--fs/crypto/keyinfo.c120
-rw-r--r--fs/dax.c945
-rw-r--r--fs/dcache.c577
-rw-r--r--fs/debugfs/file.c437
-rw-r--r--fs/debugfs/inode.c117
-rw-r--r--fs/debugfs/internal.h26
-rw-r--r--fs/devpts/inode.c260
-rw-r--r--fs/direct-io.c107
-rw-r--r--fs/dlm/config.c10
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/dlm_internal.h10
-rw-r--r--fs/dlm/lowcomms.c11
-rw-r--r--fs/ecryptfs/crypto.c76
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h11
-rw-r--r--fs/ecryptfs/file.c90
-rw-r--r--fs/ecryptfs/inode.c130
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/main.c11
-rw-r--r--fs/ecryptfs/mmap.c50
-rw-r--r--fs/ecryptfs/read_write.c14
-rw-r--r--fs/ecryptfs/super.c5
-rw-r--r--fs/efivarfs/file.c2
-rw-r--r--fs/efivarfs/inode.c40
-rw-r--r--fs/efivarfs/super.c14
-rw-r--r--fs/efs/dir.c3
-rw-r--r--fs/efs/namei.c2
-rw-r--r--fs/efs/super.c4
-rw-r--r--fs/eventpoll.c12
-rw-r--r--fs/exec.c97
-rw-r--r--fs/exofs/dir.c46
-rw-r--r--fs/exofs/inode.c37
-rw-r--r--fs/exofs/namei.c4
-rw-r--r--fs/exofs/ore.c2
-rw-r--r--fs/exofs/super.c4
-rw-r--r--fs/exportfs/expfs.c12
-rw-r--r--fs/ext2/acl.c3
-rw-r--r--fs/ext2/balloc.c21
-rw-r--r--fs/ext2/dir.c58
-rw-r--r--fs/ext2/ext2.h7
-rw-r--r--fs/ext2/file.c4
-rw-r--r--fs/ext2/inode.c30
-rw-r--r--fs/ext2/namei.c8
-rw-r--r--fs/ext2/super.c11
-rw-r--r--fs/ext2/xattr.c9
-rw-r--r--fs/ext2/xattr_security.c13
-rw-r--r--fs/ext2/xattr_trusted.c13
-rw-r--r--fs/ext2/xattr_user.c17
-rw-r--r--fs/ext4/Kconfig12
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/acl.c3
-rw-r--r--fs/ext4/balloc.c10
-rw-r--r--fs/ext4/crypto.c523
-rw-r--r--fs/ext4/crypto_fname.c468
-rw-r--r--fs/ext4/crypto_key.c274
-rw-r--r--fs/ext4/crypto_policy.c229
-rw-r--r--fs/ext4/dir.c39
-rw-r--r--fs/ext4/ext4.h255
-rw-r--r--fs/ext4/ext4_crypto.h159
-rw-r--r--fs/ext4/ext4_jbd2.h25
-rw-r--r--fs/ext4/extents.c32
-rw-r--r--fs/ext4/extents_status.c2
-rw-r--r--fs/ext4/file.c41
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/ialloc.c68
-rw-r--r--fs/ext4/indirect.c127
-rw-r--r--fs/ext4/inline.c34
-rw-r--r--fs/ext4/inode.c596
-rw-r--r--fs/ext4/ioctl.c40
-rw-r--r--fs/ext4/mballoc.c82
-rw-r--r--fs/ext4/mmp.c8
-rw-r--r--fs/ext4/move_extent.c29
-rw-r--r--fs/ext4/namei.c160
-rw-r--r--fs/ext4/page-io.c38
-rw-r--r--fs/ext4/readpage.c71
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c244
-rw-r--r--fs/ext4/symlink.c39
-rw-r--r--fs/ext4/sysfs.c1
-rw-r--r--fs/ext4/xattr.c45
-rw-r--r--fs/ext4/xattr_security.c13
-rw-r--r--fs/ext4/xattr_trusted.c13
-rw-r--r--fs/ext4/xattr_user.c17
-rw-r--r--fs/f2fs/Kconfig8
-rw-r--r--fs/f2fs/acl.c16
-rw-r--r--fs/f2fs/acl.h2
-rw-r--r--fs/f2fs/checkpoint.c141
-rw-r--r--fs/f2fs/data.c575
-rw-r--r--fs/f2fs/debug.c34
-rw-r--r--fs/f2fs/dir.c253
-rw-r--r--fs/f2fs/extent_cache.c53
-rw-r--r--fs/f2fs/f2fs.h464
-rw-r--r--fs/f2fs/file.c879
-rw-r--r--fs/f2fs/gc.c91
-rw-r--r--fs/f2fs/inline.c198
-rw-r--r--fs/f2fs/inode.c117
-rw-r--r--fs/f2fs/namei.c165
-rw-r--r--fs/f2fs/node.c476
-rw-r--r--fs/f2fs/node.h14
-rw-r--r--fs/f2fs/recovery.c172
-rw-r--r--fs/f2fs/segment.c95
-rw-r--r--fs/f2fs/segment.h31
-rw-r--r--fs/f2fs/shrinker.c5
-rw-r--r--fs/f2fs/super.c525
-rw-r--r--fs/f2fs/trace.c7
-rw-r--r--fs/f2fs/xattr.c49
-rw-r--r--fs/fat/dir.c6
-rw-r--r--fs/fat/inode.c10
-rw-r--r--fs/fat/misc.c2
-rw-r--r--fs/fat/namei_msdos.c6
-rw-r--r--fs/fat/namei_vfat.c16
-rw-r--r--fs/file.c5
-rw-r--r--fs/freevxfs/Kconfig13
-rw-r--r--fs/freevxfs/vxfs.h185
-rw-r--r--fs/freevxfs/vxfs_bmap.c70
-rw-r--r--fs/freevxfs/vxfs_dir.h17
-rw-r--r--fs/freevxfs/vxfs_extern.h10
-rw-r--r--fs/freevxfs/vxfs_fshead.c37
-rw-r--r--fs/freevxfs/vxfs_fshead.h29
-rw-r--r--fs/freevxfs/vxfs_immed.c4
-rw-r--r--fs/freevxfs/vxfs_inode.c265
-rw-r--r--fs/freevxfs/vxfs_inode.h146
-rw-r--r--fs/freevxfs/vxfs_lookup.c232
-rw-r--r--fs/freevxfs/vxfs_olt.c15
-rw-r--r--fs/freevxfs/vxfs_olt.h70
-rw-r--r--fs/freevxfs/vxfs_subr.c2
-rw-r--r--fs/freevxfs/vxfs_super.c162
-rw-r--r--fs/fs-writeback.c123
-rw-r--r--fs/fscache/histogram.c1
-rw-r--r--fs/fscache/object-list.c1
-rw-r--r--fs/fscache/page.c12
-rw-r--r--fs/fscache/stats.c1
-rw-r--r--fs/fuse/dev.c58
-rw-r--r--fs/fuse/dir.c118
-rw-r--r--fs/fuse/file.c122
-rw-r--r--fs/fuse/fuse_i.h12
-rw-r--r--fs/fuse/inode.c45
-rw-r--r--fs/gfs2/acl.c58
-rw-r--r--fs/gfs2/acl.h1
-rw-r--r--fs/gfs2/aops.c104
-rw-r--r--fs/gfs2/bmap.c17
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/dir.c20
-rw-r--r--fs/gfs2/export.c11
-rw-r--r--fs/gfs2/file.c58
-rw-r--r--fs/gfs2/glock.c26
-rw-r--r--fs/gfs2/glock.h10
-rw-r--r--fs/gfs2/glops.c7
-rw-r--r--fs/gfs2/inode.c219
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/log.c8
-rw-r--r--fs/gfs2/lops.c23
-rw-r--r--fs/gfs2/lops.h2
-rw-r--r--fs/gfs2/main.c1
-rw-r--r--fs/gfs2/meta_io.c29
-rw-r--r--fs/gfs2/meta_io.h8
-rw-r--r--fs/gfs2/ops_fstype.c10
-rw-r--r--fs/gfs2/quota.c18
-rw-r--r--fs/gfs2/recovery.c6
-rw-r--r--fs/gfs2/recovery.h4
-rw-r--r--fs/gfs2/rgrp.c38
-rw-r--r--fs/gfs2/super.c26
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/gfs2/xattr.c52
-rw-r--r--fs/hfs/attr.c11
-rw-r--r--fs/hfs/bnode.c12
-rw-r--r--fs/hfs/btree.c20
-rw-r--r--fs/hfs/catalog.c15
-rw-r--r--fs/hfs/dir.c12
-rw-r--r--fs/hfs/hfs_fs.h23
-rw-r--r--fs/hfs/inode.c21
-rw-r--r--fs/hfs/string.c4
-rw-r--r--fs/hfs/trans.c2
-rw-r--r--fs/hfsplus/bitmap.c2
-rw-r--r--fs/hfsplus/bnode.c90
-rw-r--r--fs/hfsplus/btree.c22
-rw-r--r--fs/hfsplus/catalog.c15
-rw-r--r--fs/hfsplus/dir.c12
-rw-r--r--fs/hfsplus/hfsplus_fs.h16
-rw-r--r--fs/hfsplus/inode.c18
-rw-r--r--fs/hfsplus/part_tbl.c5
-rw-r--r--fs/hfsplus/posix_acl.c3
-rw-r--r--fs/hfsplus/super.c9
-rw-r--r--fs/hfsplus/unicode.c6
-rw-r--r--fs/hfsplus/wrapper.c15
-rw-r--r--fs/hfsplus/xattr.c28
-rw-r--r--fs/hfsplus/xattr.h4
-rw-r--r--fs/hfsplus/xattr_security.c13
-rw-r--r--fs/hfsplus/xattr_trusted.c13
-rw-r--r--fs/hfsplus/xattr_user.c13
-rw-r--r--fs/hostfs/hostfs_kern.c27
-rw-r--r--fs/hpfs/dentry.c6
-rw-r--r--fs/hpfs/dir.c12
-rw-r--r--fs/hpfs/dnode.c8
-rw-r--r--fs/hpfs/hpfs_fn.h2
-rw-r--r--fs/hpfs/super.c42
-rw-r--r--fs/hugetlbfs/inode.c10
-rw-r--r--fs/inode.c35
-rw-r--r--fs/internal.h5
-rw-r--r--fs/ioctl.c1
-rw-r--r--fs/iomap.c497
-rw-r--r--fs/isofs/compress.c39
-rw-r--r--fs/isofs/dir.c4
-rw-r--r--fs/isofs/inode.c31
-rw-r--r--fs/isofs/namei.c2
-rw-r--r--fs/isofs/rock.c13
-rw-r--r--fs/jbd2/commit.c16
-rw-r--r--fs/jbd2/journal.c52
-rw-r--r--fs/jbd2/recovery.c6
-rw-r--r--fs/jbd2/transaction.c49
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/debug.c8
-rw-r--r--fs/jffs2/dir.c12
-rw-r--r--fs/jffs2/file.c23
-rw-r--r--fs/jffs2/fs.c8
-rw-r--r--fs/jffs2/gc.c8
-rw-r--r--fs/jffs2/nodelist.c8
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/scan.c2
-rw-r--r--fs/jffs2/security.c13
-rw-r--r--fs/jffs2/summary.c2
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jffs2/write.c11
-rw-r--r--fs/jffs2/xattr_trusted.c13
-rw-r--r--fs/jffs2/xattr_user.c13
-rw-r--r--fs/jfs/acl.c6
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/inode.c11
-rw-r--r--fs/jfs/jfs_debug.c1
-rw-r--r--fs/jfs/jfs_discard.c6
-rw-r--r--fs/jfs/jfs_dtree.c10
-rw-r--r--fs/jfs/jfs_imap.c3
-rw-r--r--fs/jfs/jfs_inode.c2
-rw-r--r--fs/jfs/jfs_logmgr.c21
-rw-r--r--fs/jfs/jfs_metapage.c53
-rw-r--r--fs/jfs/jfs_metapage.h4
-rw-r--r--fs/jfs/jfs_txnmgr.c23
-rw-r--r--fs/jfs/jfs_xattr.h6
-rw-r--r--fs/jfs/jfs_xtree.c1
-rw-r--r--fs/jfs/namei.c16
-rw-r--r--fs/jfs/super.c6
-rw-r--r--fs/jfs/symlink.c12
-rw-r--r--fs/jfs/xattr.c222
-rw-r--r--fs/kernfs/dir.c35
-rw-r--r--fs/kernfs/file.c51
-rw-r--r--fs/kernfs/inode.c32
-rw-r--r--fs/kernfs/kernfs-internal.h7
-rw-r--r--fs/kernfs/mount.c29
-rw-r--r--fs/libfs.c153
-rw-r--r--fs/lockd/procfs.c1
-rw-r--r--fs/lockd/svc.c13
-rw-r--r--fs/locks.c2
-rw-r--r--fs/logfs/dev_bdev.c19
-rw-r--r--fs/logfs/dev_mtd.c10
-rw-r--r--fs/logfs/dir.c26
-rw-r--r--fs/logfs/file.c26
-rw-r--r--fs/logfs/readwrite.c20
-rw-r--r--fs/logfs/segment.c28
-rw-r--r--fs/logfs/super.c16
-rw-r--r--fs/minix/dir.c20
-rw-r--r--fs/minix/namei.c4
-rw-r--r--fs/mpage.c69
-rw-r--r--fs/namei.c763
-rw-r--r--fs/namespace.c104
-rw-r--r--fs/ncpfs/dir.c18
-rw-r--r--fs/ncpfs/ncplib_kernel.h2
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c46
-rw-r--r--fs/nfs/blocklayout/blocklayout.h4
-rw-r--r--fs/nfs/blocklayout/dev.c110
-rw-r--r--fs/nfs/blocklayout/extent_tree.c27
-rw-r--r--fs/nfs/callback_proc.c73
-rw-r--r--fs/nfs/callback_xdr.c23
-rw-r--r--fs/nfs/client.c32
-rw-r--r--fs/nfs/delegation.c9
-rw-r--r--fs/nfs/delegation.h2
-rw-r--r--fs/nfs/dir.c196
-rw-r--r--fs/nfs/direct.c149
-rw-r--r--fs/nfs/file.c122
-rw-r--r--fs/nfs/filelayout/filelayout.c24
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c223
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h17
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c119
-rw-r--r--fs/nfs/inode.c145
-rw-r--r--fs/nfs/internal.h71
-rw-r--r--fs/nfs/io.c147
-rw-r--r--fs/nfs/nfs3acl.c43
-rw-r--r--fs/nfs/nfs3client.c14
-rw-r--r--fs/nfs/nfs3proc.c8
-rw-r--r--fs/nfs/nfs42.h1
-rw-r--r--fs/nfs/nfs42proc.c129
-rw-r--r--fs/nfs/nfs42xdr.c154
-rw-r--r--fs/nfs/nfs4_fs.h18
-rw-r--r--fs/nfs/nfs4client.c26
-rw-r--r--fs/nfs/nfs4file.c17
-rw-r--r--fs/nfs/nfs4idmap.c2
-rw-r--r--fs/nfs/nfs4namespace.c4
-rw-r--r--fs/nfs/nfs4proc.c365
-rw-r--r--fs/nfs/nfs4state.c20
-rw-r--r--fs/nfs/nfs4trace.h14
-rw-r--r--fs/nfs/nfs4xdr.c56
-rw-r--r--fs/nfs/nfstrace.h7
-rw-r--r--fs/nfs/objlayout/objio_osd.c2
-rw-r--r--fs/nfs/pagelist.c10
-rw-r--r--fs/nfs/pnfs.c506
-rw-r--r--fs/nfs/pnfs.h51
-rw-r--r--fs/nfs/pnfs_nfs.c81
-rw-r--r--fs/nfs/proc.c8
-rw-r--r--fs/nfs/read.c20
-rw-r--r--fs/nfs/super.c23
-rw-r--r--fs/nfs/unlink.c192
-rw-r--r--fs/nfs/write.c114
-rw-r--r--fs/nfsd/Kconfig19
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/blocklayout.c5
-rw-r--r--fs/nfsd/blocklayoutxdr.c5
-rw-r--r--fs/nfsd/export.c14
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/flexfilelayout.c133
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c115
-rw-r--r--fs/nfsd/flexfilelayoutxdr.h49
-rw-r--r--fs/nfsd/nfs2acl.c20
-rw-r--r--fs/nfsd/nfs3acl.c16
-rw-r--r--fs/nfsd/nfs3proc.c4
-rw-r--r--fs/nfsd/nfs3xdr.c4
-rw-r--r--fs/nfsd/nfs4acl.c16
-rw-r--r--fs/nfsd/nfs4callback.c18
-rw-r--r--fs/nfsd/nfs4layouts.c18
-rw-r--r--fs/nfsd/nfs4proc.c48
-rw-r--r--fs/nfsd/nfs4state.c149
-rw-r--r--fs/nfsd/nfs4xdr.c81
-rw-r--r--fs/nfsd/nfsctl.c16
-rw-r--r--fs/nfsd/nfsd.h5
-rw-r--r--fs/nfsd/nfsfh.c20
-rw-r--r--fs/nfsd/nfsproc.c7
-rw-r--r--fs/nfsd/nfsxdr.c2
-rw-r--r--fs/nfsd/pnfs.h4
-rw-r--r--fs/nfsd/state.h8
-rw-r--r--fs/nfsd/stats.c1
-rw-r--r--fs/nfsd/vfs.c160
-rw-r--r--fs/nfsd/vfs.h3
-rw-r--r--fs/nfsd/xdr4.h5
-rw-r--r--fs/nilfs2/alloc.c74
-rw-r--r--fs/nilfs2/alloc.h11
-rw-r--r--fs/nilfs2/bmap.c14
-rw-r--r--fs/nilfs2/bmap.h24
-rw-r--r--fs/nilfs2/btnode.c29
-rw-r--r--fs/nilfs2/btnode.h10
-rw-r--r--fs/nilfs2/btree.c88
-rw-r--r--fs/nilfs2/btree.h8
-rw-r--r--fs/nilfs2/cpfile.c45
-rw-r--r--fs/nilfs2/cpfile.h13
-rw-r--r--fs/nilfs2/dat.c27
-rw-r--r--fs/nilfs2/dat.h9
-rw-r--r--fs/nilfs2/dir.c161
-rw-r--r--fs/nilfs2/direct.c27
-rw-r--r--fs/nilfs2/direct.h16
-rw-r--r--fs/nilfs2/export.h2
-rw-r--r--fs/nilfs2/file.c7
-rw-r--r--fs/nilfs2/gcinode.c25
-rw-r--r--fs/nilfs2/ifile.c21
-rw-r--r--fs/nilfs2/ifile.h10
-rw-r--r--fs/nilfs2/inode.c139
-rw-r--r--fs/nilfs2/ioctl.c65
-rw-r--r--fs/nilfs2/mdt.c94
-rw-r--r--fs/nilfs2/mdt.h21
-rw-r--r--fs/nilfs2/namei.c22
-rw-r--r--fs/nilfs2/nilfs.h81
-rw-r--r--fs/nilfs2/page.c78
-rw-r--r--fs/nilfs2/page.h10
-rw-r--r--fs/nilfs2/recovery.c101
-rw-r--r--fs/nilfs2/segbuf.c34
-rw-r--r--fs/nilfs2/segbuf.h11
-rw-r--r--fs/nilfs2/segment.c176
-rw-r--r--fs/nilfs2/segment.h45
-rw-r--r--fs/nilfs2/sufile.c58
-rw-r--r--fs/nilfs2/sufile.h11
-rw-r--r--fs/nilfs2/super.c227
-rw-r--r--fs/nilfs2/sysfs.c84
-rw-r--r--fs/nilfs2/sysfs.h6
-rw-r--r--fs/nilfs2/the_nilfs.c152
-rw-r--r--fs/nilfs2/the_nilfs.h38
-rw-r--r--fs/notify/fsnotify.h7
-rw-r--r--fs/notify/group.c17
-rw-r--r--fs/notify/mark.c78
-rw-r--r--fs/ntfs/aops.c56
-rw-r--r--fs/ntfs/aops.h4
-rw-r--r--fs/ntfs/attrib.c28
-rw-r--r--fs/ntfs/bitmap.c10
-rw-r--r--fs/ntfs/compress.c79
-rw-r--r--fs/ntfs/dir.c56
-rw-r--r--fs/ntfs/file.c65
-rw-r--r--fs/ntfs/index.c14
-rw-r--r--fs/ntfs/inode.c14
-rw-r--r--fs/ntfs/lcnalloc.c6
-rw-r--r--fs/ntfs/logfile.c18
-rw-r--r--fs/ntfs/mft.c38
-rw-r--r--fs/ntfs/namei.c2
-rw-r--r--fs/ntfs/ntfs.h2
-rw-r--r--fs/ntfs/super.c72
-rw-r--r--fs/ocfs2/Makefile2
-rw-r--r--fs/ocfs2/acl.c87
-rw-r--r--fs/ocfs2/acl.h5
-rw-r--r--fs/ocfs2/alloc.c68
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c104
-rw-r--r--fs/ocfs2/buffer_head_io.c13
-rw-r--r--fs/ocfs2/cluster/heartbeat.c209
-rw-r--r--fs/ocfs2/cluster/tcp.c25
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h5
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c26
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h1
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c53
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c29
-rw-r--r--fs/ocfs2/dlm/dlmthread.c57
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c8
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c2
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h2
-rw-r--r--fs/ocfs2/dlmglue.c16
-rw-r--r--fs/ocfs2/file.c20
-rw-r--r--fs/ocfs2/inode.c9
-rw-r--r--fs/ocfs2/inode.h7
-rw-r--r--fs/ocfs2/journal.c41
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c6
-rw-r--r--fs/ocfs2/namei.c23
-rw-r--r--fs/ocfs2/ocfs2.h20
-rw-r--r--fs/ocfs2/ocfs2_fs.h2
-rw-r--r--fs/ocfs2/quota_global.c13
-rw-r--r--fs/ocfs2/refcounttree.c41
-rw-r--r--fs/ocfs2/slot_map.c6
-rw-r--r--fs/ocfs2/stack_user.c11
-rw-r--r--fs/ocfs2/stackglue.c2
-rw-r--r--fs/ocfs2/suballoc.c20
-rw-r--r--fs/ocfs2/super.c7
-rw-r--r--fs/ocfs2/xattr.c57
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/omfs/dir.c2
-rw-r--r--fs/open.c22
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/acl.c17
-rw-r--r--fs/orangefs/dcache.c4
-rw-r--r--fs/orangefs/devorangefs-req.c7
-rw-r--r--fs/orangefs/dir.c12
-rw-r--r--fs/orangefs/file.c6
-rw-r--r--fs/orangefs/inode.c61
-rw-r--r--fs/orangefs/namei.c22
-rw-r--r--fs/orangefs/orangefs-bufmap.c4
-rw-r--r--fs/orangefs/orangefs-cache.c4
-rw-r--r--fs/orangefs/orangefs-debugfs.c3
-rw-r--r--fs/orangefs/orangefs-kernel.h33
-rw-r--r--fs/orangefs/orangefs-mod.c2
-rw-r--r--fs/orangefs/orangefs-sysfs.c43
-rw-r--r--fs/orangefs/orangefs-utils.c50
-rw-r--r--fs/orangefs/protocol.h43
-rw-r--r--fs/orangefs/symlink.c2
-rw-r--r--fs/orangefs/xattr.c154
-rw-r--r--fs/overlayfs/copy_up.c27
-rw-r--r--fs/overlayfs/dir.c326
-rw-r--r--fs/overlayfs/inode.c329
-rw-r--r--fs/overlayfs/overlayfs.h43
-rw-r--r--fs/overlayfs/readdir.c16
-rw-r--r--fs/overlayfs/super.c264
-rw-r--r--fs/pipe.c38
-rw-r--r--fs/pnode.c25
-rw-r--r--fs/posix_acl.c164
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c20
-rw-r--r--fs/proc/base.c274
-rw-r--r--fs/proc/fd.c8
-rw-r--r--fs/proc/generic.c2
-rw-r--r--fs/proc/inode.c15
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/meminfo.c23
-rw-r--r--fs/proc/namespaces.c3
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c23
-rw-r--r--fs/proc/root.c58
-rw-r--r--fs/proc/stat.c10
-rw-r--r--fs/proc/task_mmu.c56
-rw-r--r--fs/proc/vmcore.c6
-rw-r--r--fs/pstore/Kconfig31
-rw-r--r--fs/pstore/inode.c5
-rw-r--r--fs/pstore/platform.c269
-rw-r--r--fs/pstore/ram.c102
-rw-r--r--fs/qnx4/dir.c2
-rw-r--r--fs/qnx6/dir.c18
-rw-r--r--fs/qnx6/inode.c4
-rw-r--r--fs/qnx6/qnx6.h2
-rw-r--r--fs/quota/dquot.c37
-rw-r--r--fs/quota/netlink.c12
-rw-r--r--fs/quota/quota.c14
-rw-r--r--fs/ramfs/file-nommu.c8
-rw-r--r--fs/ramfs/inode.c4
-rw-r--r--fs/read_write.c69
-rw-r--r--fs/readdir.c41
-rw-r--r--fs/reiserfs/dir.c2
-rw-r--r--fs/reiserfs/file.c10
-rw-r--r--fs/reiserfs/ibalance.c3
-rw-r--r--fs/reiserfs/inode.c55
-rw-r--r--fs/reiserfs/ioctl.c10
-rw-r--r--fs/reiserfs/journal.c20
-rw-r--r--fs/reiserfs/namei.c18
-rw-r--r--fs/reiserfs/objectid.c2
-rw-r--r--fs/reiserfs/stree.c8
-rw-r--r--fs/reiserfs/super.c11
-rw-r--r--fs/reiserfs/tail_conversion.c4
-rw-r--r--fs/reiserfs/xattr.c72
-rw-r--r--fs/reiserfs/xattr.h9
-rw-r--r--fs/reiserfs/xattr_acl.c8
-rw-r--r--fs/reiserfs/xattr_security.c26
-rw-r--r--fs/reiserfs/xattr_trusted.c26
-rw-r--r--fs/reiserfs/xattr_user.c26
-rw-r--r--fs/romfs/super.c4
-rw-r--r--fs/select.c67
-rw-r--r--fs/seq_file.c7
-rw-r--r--fs/splice.c35
-rw-r--r--fs/squashfs/block.c8
-rw-r--r--fs/squashfs/cache.c18
-rw-r--r--fs/squashfs/decompressor.c2
-rw-r--r--fs/squashfs/dir.c4
-rw-r--r--fs/squashfs/file.c24
-rw-r--r--fs/squashfs/file_direct.c22
-rw-r--r--fs/squashfs/lz4_wrapper.c8
-rw-r--r--fs/squashfs/lzo_wrapper.c8
-rw-r--r--fs/squashfs/page_actor.c4
-rw-r--r--fs/squashfs/page_actor.h2
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/squashfs/symlink.c6
-rw-r--r--fs/squashfs/xattr.c6
-rw-r--r--fs/squashfs/xz_wrapper.c4
-rw-r--r--fs/squashfs/zlib_wrapper.c4
-rw-r--r--fs/super.c73
-rw-r--r--fs/sync.c4
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysv/dir.c20
-rw-r--r--fs/sysv/namei.c6
-rw-r--r--fs/timerfd.c10
-rw-r--r--fs/tracefs/inode.c7
-rw-r--r--fs/ubifs/debug.c2
-rw-r--r--fs/ubifs/dir.c8
-rw-r--r--fs/ubifs/file.c90
-rw-r--r--fs/ubifs/gc.c4
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c26
-rw-r--r--fs/ubifs/ubifs.h15
-rw-r--r--fs/ubifs/xattr.c144
-rw-r--r--fs/udf/dir.c4
-rw-r--r--fs/udf/directory.c2
-rw-r--r--fs/udf/file.c13
-rw-r--r--fs/udf/inode.c13
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/udf/partition.c13
-rw-r--r--fs/udf/super.c93
-rw-r--r--fs/udf/udf_sb.h9
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/udf/unicode.c16
-rw-r--r--fs/ufs/balloc.c8
-rw-r--r--fs/ufs/dir.c65
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/namei.c6
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/ufs/util.c6
-rw-r--r--fs/ufs/util.h2
-rw-r--r--fs/userfaultfd.c63
-rw-r--r--fs/utimes.c3
-rw-r--r--fs/xattr.c32
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile8
-rw-r--r--fs/xfs/kmem.c26
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c246
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h59
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c12
-rw-r--r--fs/xfs/libxfs/xfs_attr.c129
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c19
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c282
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h54
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c32
-rw-r--r--fs/xfs/libxfs/xfs_btree.c941
-rw-r--r--fs/xfs/libxfs/xfs_btree.h90
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c65
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h4
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c31
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h44
-rw-r--r--fs/xfs/libxfs/xfs_defer.c463
-rw-r--r--fs/xfs/libxfs/xfs_defer.h97
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c15
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h8
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c47
-rw-r--r--fs/xfs/libxfs/xfs_format.h197
-rw-r--r--fs/xfs/libxfs/xfs_fs.h9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c43
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c1
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c99
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h1
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h68
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c1399
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h209
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c511
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h61
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c17
-rw-r--r--fs/xfs/libxfs/xfs_shared.h104
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c62
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h10
-rw-r--r--fs/xfs/libxfs/xfs_types.h4
-rw-r--r--fs/xfs/xfs_acl.c20
-rw-r--r--fs/xfs/xfs_aops.c693
-rw-r--r--fs/xfs/xfs_aops.h18
-rw-r--r--fs/xfs/xfs_attr.h4
-rw-r--r--fs/xfs/xfs_attr_inactive.c18
-rw-r--r--fs/xfs/xfs_attr_list.c87
-rw-r--r--fs/xfs/xfs_bmap_util.c516
-rw-r--r--fs/xfs/xfs_bmap_util.h7
-rw-r--r--fs/xfs/xfs_buf.c280
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_buf_item.c140
-rw-r--r--fs/xfs/xfs_dir2_readdir.c23
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c23
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c5
-rw-r--r--fs/xfs/xfs_error.h8
-rw-r--r--fs/xfs/xfs_export.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c71
-rw-r--r--fs/xfs/xfs_extfree_item.h3
-rw-r--r--fs/xfs/xfs_file.c465
-rw-r--r--fs/xfs/xfs_filestream.c3
-rw-r--r--fs/xfs/xfs_fsops.c225
-rw-r--r--fs/xfs/xfs_icache.c292
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c282
-rw-r--r--fs/xfs/xfs_inode.h29
-rw-r--r--fs/xfs/xfs_inode_item.c7
-rw-r--r--fs/xfs/xfs_ioctl.c72
-rw-r--r--fs/xfs/xfs_ioctl.h3
-rw-r--r--fs/xfs/xfs_ioctl32.c6
-rw-r--r--fs/xfs/xfs_iomap.c255
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c230
-rw-r--r--fs/xfs/xfs_linux.h9
-rw-r--r--fs/xfs/xfs_log.c75
-rw-r--r--fs/xfs/xfs_log.h8
-rw-r--r--fs/xfs/xfs_log_cil.c259
-rw-r--r--fs/xfs/xfs_log_priv.h1
-rw-r--r--fs/xfs/xfs_log_recover.c344
-rw-r--r--fs/xfs/xfs_mount.c42
-rw-r--r--fs/xfs/xfs_mount.h44
-rw-r--r--fs/xfs/xfs_ondisk.h34
-rw-r--r--fs/xfs/xfs_pnfs.c38
-rw-r--r--fs/xfs/xfs_pnfs.h4
-rw-r--r--fs/xfs/xfs_qm.c9
-rw-r--r--fs/xfs/xfs_qm_syscalls.c26
-rw-r--r--fs/xfs/xfs_rmap_item.c536
-rw-r--r--fs/xfs/xfs_rmap_item.h95
-rw-r--r--fs/xfs/xfs_rtalloc.c32
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_stats.c2
-rw-r--r--fs/xfs/xfs_stats.h18
-rw-r--r--fs/xfs/xfs_super.c128
-rw-r--r--fs/xfs/xfs_super.h2
-rw-r--r--fs/xfs/xfs_symlink.c62
-rw-r--r--fs/xfs/xfs_sysfs.c294
-rw-r--r--fs/xfs/xfs_sysfs.h3
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h415
-rw-r--r--fs/xfs/xfs_trans.c88
-rw-r--r--fs/xfs/xfs_trans.h35
-rw-r--r--fs/xfs/xfs_trans_extfree.c215
-rw-r--r--fs/xfs/xfs_trans_rmap.c271
-rw-r--r--fs/xfs/xfs_xattr.c32
856 files changed, 34547 insertions, 23908 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 9da967f383872..5b6a1743ea17b 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -93,7 +93,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
* instantiating the inode (v9fs_inode_from_fid)
*/
acl = get_cached_acl(inode, type);
- BUG_ON(acl == ACL_NOT_CACHED);
+ BUG_ON(is_uncached_acl(acl));
return acl;
}
@@ -213,8 +213,8 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
}
static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
struct v9fs_session_info *v9ses;
struct posix_acl *acl;
@@ -227,7 +227,7 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
return v9fs_xattr_get(dentry, handler->name, buffer, size);
- acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
+ acl = v9fs_get_cached_acl(inode, handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
}
static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
int retval;
struct posix_acl *acl;
struct v9fs_session_info *v9ses;
- struct inode *inode = d_inode(dentry);
v9ses = v9fs_dentry2v9ses(dentry);
/*
@@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
- retval = posix_acl_valid(acl);
+ retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
if (retval)
goto err_out;
}
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 47db55aee7f2d..60fb47469c86b 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -257,36 +257,12 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
return v9fs_fid_lookup_with_uid(dentry, uid, any);
}
-struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
-{
- struct p9_fid *fid, *ret;
-
- fid = v9fs_fid_lookup(dentry);
- if (IS_ERR(fid))
- return fid;
-
- ret = p9_client_walk(fid, 0, NULL, 1);
- return ret;
-}
-
-static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid)
-{
- struct p9_fid *fid, *ret;
-
- fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
- if (IS_ERR(fid))
- return fid;
-
- ret = p9_client_walk(fid, 0, NULL, 1);
- return ret;
-}
-
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
{
int err;
struct p9_fid *fid;
- fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID);
+ fid = clone_fid(v9fs_fid_lookup_with_uid(dentry, GLOBAL_ROOT_UID, 0));
if (IS_ERR(fid))
goto error_out;
/*
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 2b6787fcb6261..4491bcaf42b80 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -24,7 +24,18 @@
#include <linux/list.h>
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
-struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
+static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
+{
+ return v9fs_fid_lookup(dentry->d_parent);
+}
void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
+static inline struct p9_fid *clone_fid(struct p9_fid *fid)
+{
+ return IS_ERR(fid) ? fid : p9_client_walk(fid, 0, NULL, 1);
+}
+static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
+{
+ return clone_fid(v9fs_fid_lookup(dentry));
+}
#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e9e04376c52ce..6181ad79e1a54 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -153,7 +153,7 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset,
* If called with zero offset, we should release
* the private state assocated with the page
*/
- if (offset == 0 && length == PAGE_CACHE_SIZE)
+ if (offset == 0 && length == PAGE_SIZE)
v9fs_fscache_invalidate_page(page);
}
@@ -166,10 +166,10 @@ static int v9fs_vfs_writepage_locked(struct page *page)
struct bio_vec bvec;
int err, len;
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
bvec.bv_page = page;
bvec.bv_offset = 0;
@@ -231,7 +231,6 @@ static int v9fs_launder_page(struct page *page)
/**
* v9fs_direct_IO - 9P address space operation for direct I/O
* @iocb: target I/O control block
- * @pos: offset in file to begin the operation
*
* The presence of v9fs_direct_IO() in the address space ops vector
* allowes open() O_DIRECT flags which would have failed otherwise.
@@ -245,9 +244,10 @@ static int v9fs_launder_page(struct page *page)
*
*/
static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
+ loff_t pos = iocb->ki_pos;
ssize_t n;
int err = 0;
if (iov_iter_rw(iter) == WRITE) {
@@ -271,7 +271,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
int retval = 0;
struct page *page;
struct v9fs_inode *v9inode;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct inode *inode = mapping->host;
@@ -288,11 +288,11 @@ start:
if (PageUptodate(page))
goto out;
- if (len == PAGE_CACHE_SIZE)
+ if (len == PAGE_SIZE)
goto out;
retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
- page_cache_release(page);
+ put_page(page);
if (!retval)
goto start;
out:
@@ -313,7 +313,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
/*
* zero out the rest of the area
*/
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
zero_user(page, from + copied, len - copied);
flush_dcache_page(page);
@@ -331,7 +331,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
}
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 5cc00e56206e3..b0405d6aac854 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -246,7 +246,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
const struct file_operations v9fs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .iterate = v9fs_dir_readdir,
+ .iterate_shared = v9fs_dir_readdir,
.open = v9fs_file_open,
.release = v9fs_dir_release,
};
@@ -254,7 +254,7 @@ const struct file_operations v9fs_dir_operations = {
const struct file_operations v9fs_dir_operations_dotl = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .iterate = v9fs_dir_readdir_dotl,
+ .iterate_shared = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index eadc894faea2e..d7b78d531e63f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -74,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
v9fs_proto_dotu(v9ses));
fid = file->private_data;
if (!fid) {
- fid = v9fs_fid_clone(file->f_path.dentry);
+ fid = v9fs_fid_clone(file_dentry(file));
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -100,7 +100,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
* because we want write after unlink usecase
* to work.
*/
- fid = v9fs_writeback_fid(file->f_path.dentry);
+ fid = v9fs_writeback_fid(file_dentry(file));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
mutex_unlock(&v9inode->v_mutex);
@@ -421,8 +421,8 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
loff_t i_size;
unsigned long pg_start, pg_end;
- pg_start = origin >> PAGE_CACHE_SHIFT;
- pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT;
+ pg_start = origin >> PAGE_SHIFT;
+ pg_end = (origin + retval - 1) >> PAGE_SHIFT;
if (inode->i_mapping && inode->i_mapping->nrpages)
invalidate_inode_pages2_range(inode->i_mapping,
pg_start, pg_end);
@@ -516,7 +516,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
* because we want write after unlink usecase
* to work.
*/
- fid = v9fs_writeback_fid(filp->f_path.dentry);
+ fid = v9fs_writeback_fid(file_dentry(filp));
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
mutex_unlock(&v9inode->v_mutex);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3a08b3e6ff1d7..8b1999b528e9b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -595,7 +595,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
v9ses = v9fs_inode2v9ses(dir);
inode = d_inode(dentry);
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
retval = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
@@ -653,7 +653,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
ofid = NULL;
fid = NULL;
name = (char *) dentry->d_name.name;
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -661,7 +661,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
}
/* clone a fid to use for creation */
- ofid = p9_client_walk(dfid, 0, NULL, 1);
+ ofid = clone_fid(dfid);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
@@ -798,7 +798,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
v9ses = v9fs_inode2v9ses(dir);
/* We can walk d_parent because we hold the dir->i_mutex */
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid))
return ERR_CAST(dfid);
@@ -853,7 +853,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct p9_fid *fid, *inode_fid;
struct dentry *res = NULL;
- if (d_unhashed(dentry)) {
+ if (d_in_lookup(dentry)) {
res = v9fs_vfs_lookup(dir, dentry, 0);
if (IS_ERR(res))
return PTR_ERR(res);
@@ -975,13 +975,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_ERR(oldfid))
return PTR_ERR(oldfid);
- olddirfid = v9fs_fid_clone(old_dentry->d_parent);
+ olddirfid = clone_fid(v9fs_parent_fid(old_dentry));
if (IS_ERR(olddirfid)) {
retval = PTR_ERR(olddirfid);
goto done;
}
- newdirfid = v9fs_fid_clone(new_dentry->d_parent);
+ newdirfid = clone_fid(v9fs_parent_fid(new_dentry));
if (IS_ERR(newdirfid)) {
retval = PTR_ERR(newdirfid);
goto clunk_olddir;
@@ -1071,7 +1071,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb);
+ v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
generic_fillattr(d_inode(dentry), stat);
p9stat_free(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a34702c998f59..eeabcb0bad125 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -254,7 +254,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct posix_acl *pacl = NULL, *dacl = NULL;
struct dentry *res = NULL;
- if (d_unhashed(dentry)) {
+ if (d_in_lookup(dentry)) {
res = v9fs_vfs_lookup(dir, dentry, 0);
if (IS_ERR(res))
return PTR_ERR(res);
@@ -273,7 +273,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
name, flags, omode);
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -281,7 +281,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
}
/* clone a fid to use for creation */
- ofid = p9_client_walk(dfid, 0, NULL, 1);
+ ofid = clone_fid(dfid);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
@@ -389,7 +389,6 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
umode_t mode;
struct inode *inode;
struct p9_qid qid;
- struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
@@ -400,8 +399,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
if (dir->i_mode & S_ISGID)
omode |= S_ISGID;
- dir_dentry = dentry->d_parent;
- dfid = v9fs_fid_lookup(dir_dentry);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -691,7 +689,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
v9ses = v9fs_inode2v9ses(dir);
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -762,7 +760,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
int err;
- struct dentry *dir_dentry;
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
@@ -770,8 +767,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
dir->i_ino, old_dentry, dentry);
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = dentry->d_parent;
- dfid = v9fs_fid_lookup(dir_dentry);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid))
return PTR_ERR(dfid);
@@ -822,7 +818,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
struct p9_fid *fid = NULL, *dfid = NULL;
struct inode *inode;
struct p9_qid qid;
- struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
@@ -830,8 +825,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
MAJOR(rdev), MINOR(rdev));
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = dentry->d_parent;
- dfid = v9fs_fid_lookup(dir_dentry);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bf495cedec26a..de3ed86291969 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -87,7 +87,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_op = &v9fs_super_ops;
sb->s_bdi = &v9ses->bdi;
if (v9ses->cache)
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
+ sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
if (!v9ses->cache)
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 9dd9b47a6c1a1..f329eee6dc93e 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -97,8 +97,6 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
const void *value, size_t value_len, int flags)
{
struct p9_fid *fid = v9fs_fid_lookup(dentry);
- if (IS_ERR(fid))
- return PTR_ERR(fid);
return v9fs_fid_xattr_set(fid, name, value, value_len, flags);
}
@@ -115,7 +113,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
name, value_len, flags);
/* Clone it */
- fid = p9_client_walk(fid, 0, NULL, 1);
+ fid = clone_fid(fid);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -138,8 +136,8 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
const char *full_name = xattr_full_name(handler, name);
@@ -147,8 +145,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
}
static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
const char *full_name = xattr_full_name(handler, name);
diff --git a/fs/Kconfig b/fs/Kconfig
index 6725f59c18e6b..2bc7ad7758428 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
if BLOCK
+config FS_IOMAP
+ bool
+
source "fs/ext2/Kconfig"
source "fs/ext4/Kconfig"
source "fs/jbd2/Kconfig"
@@ -52,6 +55,7 @@ config FS_DAX_PMD
depends on FS_DAX
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
+ depends on BROKEN
endif # BLOCK
@@ -66,6 +70,12 @@ config FS_POSIX_ACL
config EXPORTFS
tristate
+config EXPORTFS_BLOCK_OPS
+ bool "Enable filesystem export operations for block IO"
+ help
+ This option enables the export operations for a filesystem to support
+ external block IO.
+
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
default y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 2d0cbbd14cfc8..c7efddf6e0380 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -1,6 +1,7 @@
config BINFMT_ELF
bool "Kernel support for ELF binaries"
depends on MMU && (BROKEN || !FRV)
+ select ELFCORE
default y
---help---
ELF (Executable and Linkable Format) is a format for libraries and
@@ -26,6 +27,7 @@ config BINFMT_ELF
config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
+ select ELFCORE
config ARCH_BINFMT_ELF_STATE
bool
@@ -34,6 +36,7 @@ config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
+ select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
segments of a binary to be located in memory independently of each
@@ -43,6 +46,11 @@ config BINFMT_ELF_FDPIC
It is also possible to run FDPIC ELF binaries on MMU linux also.
+config ELFCORE
+ bool
+ help
+ This option enables kernel/elfcore.o.
+
config CORE_DUMP_DEFAULT_ELF_HEADERS
bool "Write ELF core dumps with partial segments"
default y
@@ -81,7 +89,8 @@ config BINFMT_SCRIPT
config BINFMT_FLAT
bool "Kernel support for flat binaries"
- depends on !MMU && (!FRV || BROKEN)
+ depends on !MMU || M68K
+ depends on !FRV || BROKEN
help
Support uClinux FLAT format binaries.
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13b62d36..ed2b63257ba99 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
obj-$(CONFIG_SYSCTL) += drop_caches.o
obj-$(CONFIG_FHANDLE) += fhandle.o
+obj-$(CONFIG_FS_IOMAP) += iomap.o
obj-y += quota/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fd4cf2c48e48e..29444c83da48c 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -101,7 +101,7 @@ out:
}
static int
-adfs_match(struct qstr *name, struct object_info *obj)
+adfs_match(const struct qstr *name, struct object_info *obj)
{
int i;
@@ -126,7 +126,7 @@ adfs_match(struct qstr *name, struct object_info *obj)
}
static int
-adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj)
+adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct object_info *obj)
{
struct super_block *sb = inode->i_sb;
const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
@@ -207,7 +207,7 @@ adfs_hash(const struct dentry *parent, struct qstr *qstr)
*/
qstr->len = i = name_len;
name = qstr->name;
- hash = init_name_hash();
+ hash = init_name_hash(parent);
while (i--) {
char c;
@@ -227,7 +227,7 @@ adfs_hash(const struct dentry *parent, struct qstr *qstr)
* requirements of the underlying filesystem.
*/
static int
-adfs_compare(const struct dentry *parent, const struct dentry *dentry,
+adfs_compare(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
int i;
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index d6c7a51c93e4c..d8f217c711d37 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -472,9 +472,7 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
bool
affs_nofilenametruncate(const struct dentry *dentry)
{
- struct inode *inode = d_inode(dentry);
-
- return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE);
+ return affs_test_opt(AFFS_SB(dentry->d_sb)->s_flags, SF_NO_TRUNCATE);
}
/* Check if the name is valid for a affs object. */
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index ac4f318aafba8..f1e7294381c5a 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -20,7 +20,7 @@ static int affs_readdir(struct file *, struct dir_context *);
const struct file_operations affs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .iterate = affs_readdir,
+ .iterate_shared = affs_readdir,
.fsync = affs_file_fsync,
};
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 22fc7c802d698..0deec9cc2362c 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -389,12 +389,13 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
}
static ssize_t
-affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
+ loff_t offset = iocb->ki_pos;
ssize_t ret;
if (iov_iter_rw(iter) == WRITE) {
@@ -404,7 +405,7 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
return 0;
}
- ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, affs_get_block);
if (ret < 0 && iov_iter_rw(iter) == WRITE)
affs_write_failed(mapping, offset + count);
return ret;
@@ -510,9 +511,9 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
page->index, to);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_SIZE);
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = page->index << PAGE_CACHE_SHIFT;
+ tmp = page->index << PAGE_SHIFT;
bidx = tmp / bsize;
boff = tmp % bsize;
@@ -613,10 +614,10 @@ affs_readpage_ofs(struct file *file, struct page *page)
int err;
pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
- to = PAGE_CACHE_SIZE;
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
- to = inode->i_size & ~PAGE_CACHE_MASK;
- memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
+ to = PAGE_SIZE;
+ if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) {
+ to = inode->i_size & ~PAGE_MASK;
+ memset(page_address(page) + to, 0, PAGE_SIZE - to);
}
err = affs_do_readpage_ofs(page, to);
@@ -646,7 +647,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return err;
}
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -656,10 +657,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_SIZE);
if (err) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return err;
}
@@ -677,7 +678,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
u32 tmp;
int written;
- from = pos & (PAGE_CACHE_SIZE - 1);
+ from = pos & (PAGE_SIZE - 1);
to = pos + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
@@ -692,7 +693,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
bh = NULL;
written = 0;
- tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+ tmp = (page->index << PAGE_SHIFT) + from;
bidx = tmp / bsize;
boff = tmp % bsize;
if (boff) {
@@ -788,13 +789,13 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
done:
affs_brelse(bh);
- tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+ tmp = (page->index << PAGE_SHIFT) + from;
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
err_first_bh:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return written;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 00d3002a6780b..a2d68f828d530 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -14,11 +14,11 @@ typedef int (*toupper_t)(int);
static int affs_toupper(int ch);
static int affs_hash_dentry(const struct dentry *, struct qstr *);
-static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+static int affs_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
static int affs_intl_toupper(int ch);
static int affs_intl_hash_dentry(const struct dentry *, struct qstr *);
-static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+static int affs_intl_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
const struct dentry_operations affs_dentry_operations = {
@@ -61,7 +61,7 @@ affs_get_toupper(struct super_block *sb)
* Note: the dentry argument is the parent dentry.
*/
static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
+__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
@@ -72,7 +72,7 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
if (retval)
return retval;
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
len = min(qstr->len, AFFSNAMEMAX);
for (; len > 0; name++, len--)
hash = partial_name_hash(toupper(*name), hash);
@@ -84,7 +84,7 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
static int
affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_toupper,
+ return __affs_hash_dentry(dentry, qstr, affs_toupper,
affs_nofilenametruncate(dentry));
}
@@ -92,7 +92,7 @@ affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
static int
affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_intl_toupper,
+ return __affs_hash_dentry(dentry, qstr, affs_intl_toupper,
affs_nofilenametruncate(dentry));
}
@@ -131,20 +131,20 @@ static inline int __affs_compare_dentry(unsigned int len,
}
static int
-affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+affs_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return __affs_compare_dentry(len, str, name, affs_toupper,
- affs_nofilenametruncate(parent));
+ affs_nofilenametruncate(dentry));
}
static int
-affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+affs_intl_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return __affs_compare_dentry(len, str, name, affs_intl_toupper,
- affs_nofilenametruncate(parent));
+ affs_nofilenametruncate(dentry));
}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 2a6713b6b9f46..d6384863192ca 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -528,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
char *prefix = NULL;
new_opts = kstrdup(data, GFP_KERNEL);
- if (!new_opts)
+ if (data && !new_opts)
return -ENOMEM;
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
@@ -546,7 +546,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
}
flush_delayed_work(&sbi->sb_work);
- replace_mount_options(sb, new_opts);
+ if (new_opts)
+ replace_mount_options(sb, new_opts);
sbi->s_flags = mount_flags;
sbi->s_mode = mode;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index e10e17788f060..eba541004d90f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
.release = afs_release,
- .iterate = afs_readdir,
+ .iterate_shared = afs_readdir,
.lock = afs_lock,
.llseek = generic_file_llseek,
};
@@ -128,7 +128,7 @@ struct afs_lookup_cookie {
/*
* check that a directory page is valid
*/
-static inline void afs_dir_check_page(struct inode *dir, struct page *page)
+static inline bool afs_dir_check_page(struct inode *dir, struct page *page)
{
struct afs_dir_page *dbuf;
loff_t latter;
@@ -168,11 +168,11 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
}
SetPageChecked(page);
- return;
+ return true;
error:
- SetPageChecked(page);
SetPageError(page);
+ return false;
}
/*
@@ -181,7 +181,7 @@ error:
static inline void afs_dir_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -196,10 +196,10 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
if (!IS_ERR(page)) {
kmap(page);
- if (!PageChecked(page))
- afs_dir_check_page(dir, page);
- if (PageError(page))
- goto fail;
+ if (unlikely(!PageChecked(page))) {
+ if (PageError(page) || !afs_dir_check_page(dir, page))
+ goto fail;
+ }
}
return page;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 999bc3caec927..6344aee4ac4bf 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -164,7 +164,7 @@ int afs_page_filler(void *data, struct page *page)
_debug("cache said ENOBUFS");
default:
go_on:
- offset = page->index << PAGE_CACHE_SHIFT;
+ offset = page->index << PAGE_SHIFT;
len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
/* read the contents of the file from the server into the
@@ -319,7 +319,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ if (offset == 0 && length == PAGE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index ccd0b212e82a7..81dd075356b96 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -93,7 +93,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
kunmap(page);
out_free:
- page_cache_release(page);
+ put_page(page);
out:
_leave(" = %d", ret);
return ret;
@@ -189,7 +189,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
buf = kmap_atomic(page);
memcpy(devname, buf, size);
kunmap_atomic(buf);
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
@@ -211,7 +211,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
return mnt;
error:
- page_cache_release(page);
+ put_page(page);
error_no_page:
free_page((unsigned long) options);
error_no_options:
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index b50642870a43b..4832de84d52cb 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -65,6 +65,12 @@ static void afs_async_workfn(struct work_struct *work)
call->async_workfn(call);
}
+static int afs_wait_atomic_t(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
+
/*
* open an RxRPC socket and bind it to be a server for callback notifications
* - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -79,18 +85,14 @@ int afs_open_socket(void)
skb_queue_head_init(&afs_incoming_calls);
+ ret = -ENOMEM;
afs_async_calls = create_singlethread_workqueue("kafsd");
- if (!afs_async_calls) {
- _leave(" = -ENOMEM [wq]");
- return -ENOMEM;
- }
+ if (!afs_async_calls)
+ goto error_0;
ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
- if (ret < 0) {
- destroy_workqueue(afs_async_calls);
- _leave(" = %d [socket]", ret);
- return ret;
- }
+ if (ret < 0)
+ goto error_1;
socket->sk->sk_allocation = GFP_NOFS;
@@ -105,18 +107,26 @@ int afs_open_socket(void)
sizeof(srx.transport.sin.sin_addr));
ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
- if (ret < 0) {
- sock_release(socket);
- destroy_workqueue(afs_async_calls);
- _leave(" = %d [bind]", ret);
- return ret;
- }
+ if (ret < 0)
+ goto error_2;
+
+ ret = kernel_listen(socket, INT_MAX);
+ if (ret < 0)
+ goto error_2;
rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
afs_socket = socket;
_leave(" = 0");
return 0;
+
+error_2:
+ sock_release(socket);
+error_1:
+ destroy_workqueue(afs_async_calls);
+error_0:
+ _leave(" = %d", ret);
+ return ret;
}
/*
@@ -126,13 +136,16 @@ void afs_close_socket(void)
{
_enter("");
+ wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+ _debug("no outstanding calls");
+
sock_release(afs_socket);
_debug("dework");
destroy_workqueue(afs_async_calls);
ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
- ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0);
_leave("");
}
@@ -178,8 +191,6 @@ static void afs_free_call(struct afs_call *call)
{
_debug("DONE %p{%s} [%d]",
call, call->type->name, atomic_read(&afs_outstanding_calls));
- if (atomic_dec_return(&afs_outstanding_calls) == -1)
- BUG();
ASSERTCMP(call->rxcall, ==, NULL);
ASSERT(!work_pending(&call->async_work));
@@ -188,6 +199,9 @@ static void afs_free_call(struct afs_call *call)
kfree(call->request);
kfree(call);
+
+ if (atomic_dec_and_test(&afs_outstanding_calls))
+ wake_up_atomic_t(&afs_outstanding_calls);
}
/*
@@ -420,9 +434,11 @@ error_kill_call:
}
/*
- * handles intercepted messages that were arriving in the socket's Rx queue
- * - called with the socket receive queue lock held to ensure message ordering
- * - called with softirqs disabled
+ * Handles intercepted messages that were arriving in the socket's Rx queue.
+ *
+ * Called from the AF_RXRPC call processor in waitqueue process context. For
+ * each call, it is guaranteed this will be called in order of packet to be
+ * delivered.
*/
static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
struct sk_buff *skb)
@@ -513,6 +529,12 @@ static void afs_deliver_to_call(struct afs_call *call)
call->state = AFS_CALL_ABORTED;
_debug("Rcv ABORT %u -> %d", abort_code, call->error);
break;
+ case RXRPC_SKB_MARK_LOCAL_ABORT:
+ abort_code = rxrpc_kernel_get_abort_code(skb);
+ call->error = call->type->abort_to_error(abort_code);
+ call->state = AFS_CALL_ABORTED;
+ _debug("Loc ABORT %u -> %d", abort_code, call->error);
+ break;
case RXRPC_SKB_MARK_NET_ERROR:
call->error = -rxrpc_kernel_get_error_number(skb);
call->state = AFS_CALL_ERROR;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 81afefe7d8a6e..fbdb022b75a27 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -315,8 +315,8 @@ static int afs_fill_super(struct super_block *sb,
_enter("");
/* fill in the superblock */
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
sb->s_bdi = &as->volume->bdi;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index dfef94f70667c..14d506efd1aaa 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -93,10 +93,10 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
_enter(",,%llu", (unsigned long long)pos);
i_size = i_size_read(&vnode->vfs_inode);
- if (pos + PAGE_CACHE_SIZE > i_size)
+ if (pos + PAGE_SIZE > i_size)
len = i_size - pos;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
if (ret < 0) {
@@ -123,9 +123,9 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct page *page;
struct key *key = file->private_data;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
int ret;
_enter("{%x:%u},{%lx},%u,%u",
@@ -151,8 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
*pagep = page;
/* page won't leak in error case: it eventually gets cleaned off LRU */
- if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) {
- ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page);
+ if (!PageUptodate(page) && len != PAGE_SIZE) {
+ ret = afs_fill_page(vnode, key, index << PAGE_SHIFT, page);
if (ret < 0) {
kfree(candidate);
_leave(" = %d [prep]", ret);
@@ -266,7 +266,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
if (PageDirty(page))
_debug("dirtied");
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -480,7 +480,7 @@ static int afs_writepages_region(struct address_space *mapping,
if (page->index > end) {
*_next = index;
- page_cache_release(page);
+ put_page(page);
_leave(" = 0 [%lx]", *_next);
return 0;
}
@@ -494,7 +494,7 @@ static int afs_writepages_region(struct address_space *mapping,
if (page->mapping != mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
continue;
}
@@ -515,7 +515,7 @@ static int afs_writepages_region(struct address_space *mapping,
ret = afs_write_back_from_locked_page(wb, page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (ret < 0) {
_leave(" = %d", ret);
return ret;
@@ -551,13 +551,13 @@ int afs_writepages(struct address_space *mapping,
&next);
mapping->writeback_index = next;
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT);
+ end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT);
ret = afs_writepages_region(mapping, wbc, 0, end, &next);
if (wbc->nr_to_write > 0)
mapping->writeback_index = next;
} else {
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ start = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
ret = afs_writepages_region(mapping, wbc, start, end, &next);
}
@@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
return 0;
result = generic_file_write_iter(iocb, from);
- if (IS_ERR_VALUE(result)) {
- _leave(" = %zd", result);
- return result;
- }
_leave(" = %zd", result);
return result;
diff --git a/fs/aio.c b/fs/aio.c
index 155f84253f331..fb8e45b88cd4e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -496,7 +496,12 @@ static int aio_setup_ring(struct kioctx *ctx)
ctx->mmap_size = nr_pages * PAGE_SIZE;
pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem)) {
+ ctx->mmap_size = 0;
+ aio_free_ring(ctx);
+ return -EINTR;
+ }
+
ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
PROT_READ | PROT_WRITE,
MAP_SHARED, 0, &unused);
@@ -1447,8 +1452,6 @@ rw_common:
return ret;
}
- len = ret;
-
if (rw == WRITE)
file_start_write(file);
diff --git a/fs/attr.c b/fs/attr.c
index 25b24d0f6c881..42bb42bb3c72c 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -255,6 +255,25 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
return 0;
+ /*
+ * Verify that uid/gid changes are valid in the target
+ * namespace of the superblock.
+ */
+ if (ia_valid & ATTR_UID &&
+ !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+ return -EOVERFLOW;
+ if (ia_valid & ATTR_GID &&
+ !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+ return -EOVERFLOW;
+
+ /* Don't allow modifications of files with invalid uids or
+ * gids unless those uids & gids are being made valid.
+ */
+ if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
+ return -EOVERFLOW;
+ if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
+ return -EOVERFLOW;
+
error = security_inode_setattr(dentry, attr);
if (error)
return error;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index f0d268b97d196..a439548de785d 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -70,9 +70,13 @@ struct autofs_info {
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */
-#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
+#define AUTOFS_INF_WANT_EXPIRE (1<<1) /* the dentry is being considered
* for expiry, so RCU_walk is
- * not permitted
+ * not permitted. If it progresses to
+ * actual expiry attempt, the flag is
+ * not cleared when EXPIRING is set -
+ * in that case it gets cleared only
+ * when it comes to clearing EXPIRING.
*/
#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 9510d8d2e9cd2..b493909e74920 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -316,19 +316,17 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
if (ino->flags & AUTOFS_INF_PENDING)
goto out;
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
- ino->flags |= AUTOFS_INF_NO_RCU;
+ ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
spin_lock(&sbi->fs_lock);
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
ino->flags |= AUTOFS_INF_EXPIRING;
- smp_mb();
- ino->flags &= ~AUTOFS_INF_NO_RCU;
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
return root;
}
- ino->flags &= ~AUTOFS_INF_NO_RCU;
+ ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
}
out:
spin_unlock(&sbi->fs_lock);
@@ -446,7 +444,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
while ((dentry = get_next_positive_subdir(dentry, root))) {
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
- if (ino->flags & AUTOFS_INF_NO_RCU)
+ if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
expired = NULL;
else
expired = should_expire(dentry, mnt, timeout, how);
@@ -455,7 +453,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
continue;
}
ino = autofs4_dentry_ino(expired);
- ino->flags |= AUTOFS_INF_NO_RCU;
+ ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
spin_lock(&sbi->fs_lock);
@@ -465,7 +463,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
goto found;
}
- ino->flags &= ~AUTOFS_INF_NO_RCU;
+ ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
if (expired != dentry)
dput(expired);
spin_unlock(&sbi->fs_lock);
@@ -475,17 +473,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
found:
pr_debug("returning %p %pd\n", expired, expired);
ino->flags |= AUTOFS_INF_EXPIRING;
- smp_mb();
- ino->flags &= ~AUTOFS_INF_NO_RCU;
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
- spin_lock(&sbi->lookup_lock);
- spin_lock(&expired->d_parent->d_lock);
- spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&expired->d_parent->d_subdirs, &expired->d_child);
- spin_unlock(&expired->d_lock);
- spin_unlock(&expired->d_parent->d_lock);
- spin_unlock(&sbi->lookup_lock);
return expired;
}
@@ -496,7 +485,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
int status;
/* Block on any pending expire */
- if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
+ if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
return 0;
if (rcu_walk)
return -ECHILD;
@@ -554,7 +543,7 @@ int autofs4_expire_run(struct super_block *sb,
ino = autofs4_dentry_ino(dentry);
/* avoid rapid-fire expire attempts if expiry fails */
ino->last_used = now;
- ino->flags &= ~AUTOFS_INF_EXPIRING;
+ ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
@@ -583,7 +572,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
spin_lock(&sbi->fs_lock);
/* avoid rapid-fire expire attempts if expiry fails */
ino->last_used = now;
- ino->flags &= ~AUTOFS_INF_EXPIRING;
+ ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
dput(dentry);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 7ab923940d18c..fa84bb8832e0b 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -39,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
.open = dcache_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .iterate = dcache_readdir,
+ .iterate_shared = dcache_readdir,
.llseek = dcache_dir_lseek,
.unlocked_ioctl = autofs4_root_ioctl,
#ifdef CONFIG_COMPAT
@@ -51,7 +51,7 @@ const struct file_operations autofs4_dir_operations = {
.open = autofs4_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .iterate = dcache_readdir,
+ .iterate_shared = dcache_readdir,
.llseek = dcache_dir_lseek,
};
@@ -159,7 +159,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct dentry *parent = dentry->d_parent;
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
unsigned int len = name->len;
unsigned int hash = name->hash;
const unsigned char *str = name->name;
@@ -172,7 +172,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
list_for_each(p, head) {
struct autofs_info *ino;
struct dentry *active;
- struct qstr *qstr;
+ const struct qstr *qstr;
ino = list_entry(p, struct autofs_info, active);
active = ino->dentry;
@@ -214,7 +214,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct dentry *parent = dentry->d_parent;
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
unsigned int len = name->len;
unsigned int hash = name->hash;
const unsigned char *str = name->name;
@@ -227,7 +227,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
list_for_each(p, head) {
struct autofs_info *ino;
struct dentry *expiring;
- struct qstr *qstr;
+ const struct qstr *qstr;
if (rcu_walk) {
spin_unlock(&sbi->lookup_lock);
@@ -458,7 +458,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
*/
struct inode *inode;
- if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
+ if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
return 0;
if (d_mountpoint(dentry))
return 0;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 0146d911f468c..431fd7ee34886 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info *sbi,
set_fs(KERNEL_DS);
mutex_lock(&sbi->pipe_mutex);
- wr = __vfs_write(file, data, bytes, &file->f_pos);
- while (bytes && wr) {
+ while (bytes) {
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
+ if (wr <= 0)
+ break;
data += wr;
bytes -= wr;
- wr = __vfs_write(file, data, bytes, &file->f_pos);
}
mutex_unlock(&sbi->pipe_mutex);
@@ -224,7 +225,7 @@ rename_retry:
}
static struct autofs_wait_queue *
-autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
+autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
{
struct autofs_wait_queue *wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
*/
static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
- struct qstr *qstr,
+ const struct qstr *qstr,
struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
@@ -397,7 +398,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
}
}
qstr.name = name;
- qstr.hash = full_name_hash(name, qstr.len);
+ qstr.hash = full_name_hash(dentry, name, qstr.len);
if (mutex_lock_interruptible(&sbi->wq_mutex)) {
kfree(qstr.name);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 103f5d7c30838..3ba385eaa26ee 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -100,14 +100,14 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
return -EIO;
}
-static int bad_inode_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value, size_t size, int flags)
{
return -EIO;
}
-static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static ssize_t bad_inode_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
return -EIO;
}
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 35d19e8731e35..e0f59263a96d5 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -116,7 +116,7 @@ BEFS_I(const struct inode *inode)
}
static inline befs_blocknr_t
-iaddr2blockno(struct super_block *sb, befs_inode_addr * iaddr)
+iaddr2blockno(struct super_block *sb, const befs_inode_addr *iaddr)
{
return ((iaddr->allocation_group << BEFS_SB(sb)->ag_shift) +
iaddr->start);
@@ -141,7 +141,7 @@ befs_iaddrs_per_block(struct super_block *sb)
}
static inline int
-befs_iaddr_is_empty(befs_inode_addr * iaddr)
+befs_iaddr_is_empty(const befs_inode_addr *iaddr)
{
return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len);
}
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 22c1662808830..307645f9e284c 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -88,15 +88,15 @@ struct befs_btree_node {
static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
/* local functions */
-static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
+static int befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds,
befs_btree_super * bt_super,
struct befs_btree_node *this_node,
befs_off_t * node_off);
-static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
+static int befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds,
befs_btree_super * sup);
-static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
+static int befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds,
struct befs_btree_node *node,
befs_off_t node_off);
@@ -134,7 +134,7 @@ static int befs_compare_strings(const void *key1, int keylen1,
* On failure, BEFS_ERR is returned.
*/
static int
-befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
+befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds,
befs_btree_super * sup)
{
struct buffer_head *bh;
@@ -193,7 +193,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
*/
static int
-befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
+befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds,
struct befs_btree_node *node, befs_off_t node_off)
{
uint off = 0;
@@ -247,7 +247,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
* actual value stored with the key.
*/
int
-befs_btree_find(struct super_block *sb, befs_data_stream * ds,
+befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
const char *key, befs_off_t * value)
{
struct befs_btree_node *this_node;
@@ -416,7 +416,7 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node,
* until the (key_no)th key is found or the tree is out of keys.
*/
int
-befs_btree_read(struct super_block *sb, befs_data_stream * ds,
+befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize,
befs_off_t * value)
{
@@ -548,7 +548,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
* Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY.
*/
static int
-befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
+befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds,
befs_btree_super *bt_super,
struct befs_btree_node *this_node,
befs_off_t * node_off)
diff --git a/fs/befs/btree.h b/fs/befs/btree.h
index 92e781e5f30ef..f2a8f637e9e07 100644
--- a/fs/befs/btree.h
+++ b/fs/befs/btree.h
@@ -4,10 +4,10 @@
*/
-int befs_btree_find(struct super_block *sb, befs_data_stream * ds,
+int befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
const char *key, befs_off_t * value);
-int befs_btree_read(struct super_block *sb, befs_data_stream * ds,
+int befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
loff_t key_no, size_t bufsize, char *keybuf,
size_t * keysize, befs_off_t * value);
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index ebd50718659f2..af1bc19b7c85b 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -21,16 +21,16 @@
const befs_inode_addr BAD_IADDR = { 0, 0, 0 };
static int befs_find_brun_direct(struct super_block *sb,
- befs_data_stream * data,
+ const befs_data_stream *data,
befs_blocknr_t blockno, befs_block_run * run);
static int befs_find_brun_indirect(struct super_block *sb,
- befs_data_stream * data,
+ const befs_data_stream *data,
befs_blocknr_t blockno,
befs_block_run * run);
static int befs_find_brun_dblindirect(struct super_block *sb,
- befs_data_stream * data,
+ const befs_data_stream *data,
befs_blocknr_t blockno,
befs_block_run * run);
@@ -45,10 +45,10 @@ static int befs_find_brun_dblindirect(struct super_block *sb,
* if you don't need to know offset just set @off = NULL.
*/
struct buffer_head *
-befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
+befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
befs_off_t pos, uint * off)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_block_run run;
befs_blocknr_t block; /* block corresponding to pos */
@@ -87,7 +87,7 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
* 2001-11-15 Will Dyson
*/
int
-befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
+befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
befs_blocknr_t fblock, befs_block_run * run)
{
int err;
@@ -122,12 +122,12 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
* Returns the number of bytes read
*/
size_t
-befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
- befs_off_t len)
+befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds,
+ void *buff, befs_off_t len)
{
befs_off_t bytes_read = 0; /* bytes read so far */
u16 plen;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_debug(sb, "---> %s length: %llu", __func__, len);
while (bytes_read < len) {
@@ -163,7 +163,7 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
*/
befs_blocknr_t
-befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
+befs_count_blocks(struct super_block *sb, const befs_data_stream *ds)
{
befs_blocknr_t blocks;
befs_blocknr_t datablocks; /* File data blocks */
@@ -243,11 +243,11 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
2001-11-15 Will Dyson
*/
static int
-befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
+befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data,
befs_blocknr_t blockno, befs_block_run * run)
{
int i;
- befs_block_run *array = data->direct;
+ const befs_block_run *array = data->direct;
befs_blocknr_t sum;
befs_blocknr_t max_block =
data->max_direct_range >> BEFS_SB(sb)->block_shift;
@@ -304,7 +304,8 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
*/
static int
befs_find_brun_indirect(struct super_block *sb,
- befs_data_stream * data, befs_blocknr_t blockno,
+ const befs_data_stream *data,
+ befs_blocknr_t blockno,
befs_block_run * run)
{
int i, j;
@@ -412,7 +413,8 @@ befs_find_brun_indirect(struct super_block *sb,
*/
static int
befs_find_brun_dblindirect(struct super_block *sb,
- befs_data_stream * data, befs_blocknr_t blockno,
+ const befs_data_stream *data,
+ befs_blocknr_t blockno,
befs_block_run * run)
{
int dblindir_indx;
@@ -427,7 +429,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
struct buffer_head *dbl_indir_block;
struct buffer_head *indir_block;
befs_block_run indir_run;
- befs_disk_inode_addr *iaddr_array = NULL;
+ befs_disk_inode_addr *iaddr_array;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_blocknr_t indir_start_blk =
@@ -486,7 +488,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
brelse(dbl_indir_block);
- iaddr_array = NULL;
/* Read indirect block */
which_block = indir_indx / befs_iaddrs_per_block(sb);
@@ -511,7 +512,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
brelse(indir_block);
- iaddr_array = NULL;
blockno_at_run_start = indir_start_blk;
blockno_at_run_start += diblklen * dblindir_indx;
diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h
index 45e8a3c98249c..91ba8203d83f2 100644
--- a/fs/befs/datastream.h
+++ b/fs/befs/datastream.h
@@ -4,16 +4,17 @@
*/
struct buffer_head *befs_read_datastream(struct super_block *sb,
- befs_data_stream * ds, befs_off_t pos,
- uint * off);
+ const befs_data_stream *ds,
+ befs_off_t pos, uint * off);
-int befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
+int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
befs_blocknr_t fblock, befs_block_run * run);
-size_t befs_read_lsymlink(struct super_block *sb, befs_data_stream * data,
+size_t befs_read_lsymlink(struct super_block *sb, const befs_data_stream *data,
void *buff, befs_off_t len);
-befs_blocknr_t befs_count_blocks(struct super_block *sb, befs_data_stream * ds);
+befs_blocknr_t befs_count_blocks(struct super_block *sb,
+ const befs_data_stream *ds);
extern const befs_inode_addr BAD_IADDR;
diff --git a/fs/befs/io.c b/fs/befs/io.c
index 7a5b4ec21c569..523c8af2d770b 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -26,7 +26,7 @@
struct buffer_head *
befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_blocknr_t block = 0;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
@@ -63,7 +63,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
struct buffer_head *
befs_bread(struct super_block *sb, befs_blocknr_t block)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index cc0e08252913a..7da05b159ade2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
static const struct file_operations befs_dir_operations = {
.read = generic_read_dir,
- .iterate = befs_readdir,
+ .iterate_shared = befs_readdir,
.llseek = generic_file_llseek,
};
@@ -155,9 +155,9 @@ befs_get_block(struct inode *inode, sector_t block,
static struct dentry *
befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
- struct inode *inode = NULL;
+ struct inode *inode;
struct super_block *sb = dir->i_sb;
- befs_data_stream *ds = &BEFS_I(dir)->i_data.ds;
+ const befs_data_stream *ds = &BEFS_I(dir)->i_data.ds;
befs_off_t offset;
int ret;
int utfnamelen;
@@ -207,7 +207,7 @@ befs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
+ const befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
befs_off_t value;
int result;
size_t keysize;
@@ -294,10 +294,10 @@ static void init_once(void *foo)
static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
{
- struct buffer_head *bh = NULL;
- befs_inode *raw_inode = NULL;
+ struct buffer_head *bh;
+ befs_inode *raw_inode;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
- struct befs_inode_info *befs_ino = NULL;
+ struct befs_inode_info *befs_ino;
struct inode *inode;
long ret = -EIO;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3ec6113146c0b..34a5bc2f12902 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -70,7 +70,7 @@ static int bfs_readdir(struct file *f, struct dir_context *ctx)
const struct file_operations bfs_dir_operations = {
.read = generic_read_dir,
- .iterate = bfs_readdir,
+ .iterate_shared = bfs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 4c556680fa749..ae1b5404fced4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -127,12 +127,8 @@ static int set_brk(unsigned long start, unsigned long end)
{
start = PAGE_ALIGN(start);
end = PAGE_ALIGN(end);
- if (end > start) {
- unsigned long addr;
- addr = vm_brk(start, end - start);
- if (BAD_ADDR(addr))
- return addr;
- }
+ if (end > start)
+ return vm_brk(start, end - start);
return 0;
}
@@ -275,7 +271,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
map_size = ex.a_text+ex.a_data;
#endif
error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error != (text_addr & PAGE_MASK))
+ if (error)
return error;
error = read_code(bprm->file, text_addr, pos,
@@ -297,7 +293,10 @@ static int load_aout_binary(struct linux_binprm * bprm)
}
if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
- vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+ error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+ if (error)
+ return error;
+
read_code(bprm->file, N_TXTADDR(ex), fd_offset,
ex.a_text + ex.a_data);
goto beyond_if;
@@ -378,8 +377,10 @@ static int load_aout_library(struct file *file)
"N_TXTOFF is not page aligned. Please convert library: %pD\n",
file);
}
- vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-
+ retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
+ if (retval)
+ goto out;
+
read_code(file, start_addr, N_TXTOFF(ex),
ex.a_text + ex.a_data);
retval = 0;
@@ -397,9 +398,8 @@ static int load_aout_library(struct file *file)
len = PAGE_ALIGN(ex.a_text + ex.a_data);
bss = ex.a_text + ex.a_data + ex.a_bss;
if (bss > len) {
- error = vm_brk(start_addr + len, bss - len);
- retval = error;
- if (error != start_addr + len)
+ retval = vm_brk(start_addr + len, bss - len);
+ if (retval)
goto out;
}
retval = 0;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7d914c67a9d07..7f6aff3f72eba 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end)
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end > start) {
- unsigned long addr;
- addr = vm_brk(start, end - start);
- if (BAD_ADDR(addr))
- return addr;
+ int error = vm_brk(start, end - start);
+ if (error)
+ return error;
}
current->mm->start_brk = current->mm->brk = end;
return 0;
@@ -606,30 +605,32 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
* Do the same thing for the memory mapping - between
* elf_bss and last_bss is the bss section.
*/
- k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
if (k > last_bss)
last_bss = k;
}
}
+ /*
+ * Now fill out the bss section: first pad the last page from
+ * the file up to the page boundary, and zero it from elf_bss
+ * up to the end of the page.
+ */
+ if (padzero(elf_bss)) {
+ error = -EFAULT;
+ goto out;
+ }
+ /*
+ * Next, align both the file and mem bss up to the page size,
+ * since this is where elf_bss was just zeroed up to, and where
+ * last_bss will end after the vm_brk() below.
+ */
+ elf_bss = ELF_PAGEALIGN(elf_bss);
+ last_bss = ELF_PAGEALIGN(last_bss);
+ /* Finally, if there is still more bss to allocate, do it. */
if (last_bss > elf_bss) {
- /*
- * Now fill out the bss section. First pad the last page up
- * to the page boundary, and then perform a mmap to make sure
- * that there are zero-mapped pages up to and including the
- * last bss page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out;
- }
-
- /* What we have mapped so far */
- elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
-
- /* Map the last of the bss segment */
error = vm_brk(elf_bss, last_bss - elf_bss);
- if (BAD_ADDR(error))
+ if (error)
goto out;
}
@@ -1176,8 +1177,11 @@ static int load_elf_library(struct file *file)
len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
ELF_MIN_ALIGN - 1);
bss = eppnt->p_memsz + eppnt->p_vaddr;
- if (bss > len)
- vm_brk(len, bss - len);
+ if (bss > len) {
+ error = vm_brk(len, bss - len);
+ if (error)
+ goto out_free_ph;
+ }
error = 0;
out_free_ph:
@@ -2273,7 +2277,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Align to page */
- if (!dump_skip(cprm, dataoff - cprm->written))
+ if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
@@ -2292,7 +2296,7 @@ static int elf_core_dump(struct coredump_params *cprm)
void *kaddr = kmap(page);
stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
} else
stop = !dump_skip(cprm, PAGE_SIZE);
if (stop)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index b1adb92e69de7..464a972e88c13 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -67,8 +67,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *,
struct elf_fdpic_params *);
#ifndef CONFIG_MMU
-static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *,
- unsigned long *);
static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
struct file *,
struct mm_struct *);
@@ -515,8 +513,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
sp = mm->start_stack;
/* stack the program arguments and environment */
- if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0)
+ if (transfer_args_to_stack(bprm, &sp) < 0)
return -EFAULT;
+ sp &= ~15;
#endif
/*
@@ -711,39 +710,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
/*****************************************************************************/
/*
- * transfer the program arguments and environment from the holding pages onto
- * the stack
- */
-#ifndef CONFIG_MMU
-static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm,
- unsigned long *_sp)
-{
- unsigned long index, stop, sp;
- char *src;
- int ret = 0;
-
- stop = bprm->p >> PAGE_SHIFT;
- sp = *_sp;
-
- for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
- src = kmap(bprm->page[index]);
- sp -= PAGE_SIZE;
- if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0)
- ret = -EFAULT;
- kunmap(bprm->page[index]);
- if (ret < 0)
- goto out;
- }
-
- *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15;
-
-out:
- return ret;
-}
-#endif
-
-/*****************************************************************************/
-/*
* load the appropriate binary image (executable or interpreter) into memory
* - we assume no MMU is available
* - if no other PIC bits are set in params->hdr->e_flags
@@ -1533,7 +1499,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
void *kaddr = kmap(page);
res = dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
} else {
res = dump_skip(cprm, PAGE_SIZE);
}
@@ -1787,7 +1753,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
}
- if (!dump_skip(cprm, dataoff - cprm->written))
+ if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
if (!elf_fdpic_dump_segments(cprm))
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 490538536cb44..dd2d3f0cd55d8 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -24,7 +24,8 @@
static int load_em86(struct linux_binprm *bprm)
{
- char *interp, *i_name, *i_arg;
+ const char *i_name, *i_arg;
+ char *interp;
struct file * file;
int retval;
struct elfhdr elf_ex;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f723cd3a455cb..9b2917a302940 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -15,7 +15,8 @@
* JAN/99 -- coded full program relocation (gerg@snapgear.com)
*/
-#include <linux/export.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -25,8 +26,6 @@
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/slab.h>
@@ -34,26 +33,16 @@
#include <linux/personality.h>
#include <linux/init.h>
#include <linux/flat.h>
-#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <asm/cacheflush.h>
#include <asm/page.h>
/****************************************************************************/
-#if 0
-#define DEBUG 1
-#endif
-
-#ifdef DEBUG
-#define DBG_FLT(a...) printk(a)
-#else
-#define DBG_FLT(a...)
-#endif
-
/*
* User data (data section and bss) needs to be aligned.
* We pick 0x20 here because it is the max value elf2flt has always
@@ -80,7 +69,7 @@ struct lib_info {
unsigned long text_len; /* Length of text segment */
unsigned long entry; /* Start address for this module */
unsigned long build_date; /* When this one was compiled */
- short loaded; /* Has this library been loaded? */
+ bool loaded; /* Has this library been loaded? */
} lib_list[MAX_SHARED_LIBS];
};
@@ -106,59 +95,67 @@ static struct linux_binfmt flat_format = {
static int flat_core_dump(struct coredump_params *cprm)
{
- printk("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, (int) cprm->siginfo->si_signo);
- return(1);
+ pr_warn("Process %s:%d received signr %d and should have core dumped\n",
+ current->comm, current->pid, cprm->siginfo->si_signo);
+ return 1;
}
/****************************************************************************/
/*
* create_flat_tables() parses the env- and arg-strings in new user
* memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
+ * addresses on the "stack", recording the new stack pointer value.
*/
-static unsigned long create_flat_tables(
- unsigned long pp,
- struct linux_binprm * bprm)
+static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start)
{
- unsigned long *argv,*envp;
- unsigned long * sp;
- char * p = (char*)pp;
- int argc = bprm->argc;
- int envc = bprm->envc;
- char uninitialized_var(dummy);
-
- sp = (unsigned long *)p;
- sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
- sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
- argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
- envp = argv + (argc + 1);
+ char __user *p;
+ unsigned long __user *sp;
+ long i, len;
+
+ p = (char __user *)arg_start;
+ sp = (unsigned long __user *)current->mm->start_stack;
+
+ sp -= bprm->envc + 1;
+ sp -= bprm->argc + 1;
+ sp -= flat_argvp_envp_on_stack() ? 2 : 0;
+ sp -= 1; /* &argc */
+ current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN;
+ sp = (unsigned long __user *)current->mm->start_stack;
+
+ __put_user(bprm->argc, sp++);
if (flat_argvp_envp_on_stack()) {
- put_user((unsigned long) envp, sp + 2);
- put_user((unsigned long) argv, sp + 1);
- }
-
- put_user(argc, sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-->0) {
- put_user((unsigned long) p, argv++);
- do {
- get_user(dummy, p); p++;
- } while (dummy);
- }
- put_user((unsigned long) NULL, argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-->0) {
- put_user((unsigned long)p, envp); envp++;
- do {
- get_user(dummy, p); p++;
- } while (dummy);
- }
- put_user((unsigned long) NULL, envp);
- current->mm->env_end = (unsigned long) p;
- return (unsigned long)sp;
+ unsigned long argv, envp;
+ argv = (unsigned long)(sp + 2);
+ envp = (unsigned long)(sp + 2 + bprm->argc + 1);
+ __put_user(argv, sp++);
+ __put_user(envp, sp++);
+ }
+
+ current->mm->arg_start = (unsigned long)p;
+ for (i = bprm->argc; i > 0; i--) {
+ __put_user((unsigned long)p, sp++);
+ len = strnlen_user(p, MAX_ARG_STRLEN);
+ if (!len || len > MAX_ARG_STRLEN)
+ return -EINVAL;
+ p += len;
+ }
+ __put_user(0, sp++);
+ current->mm->arg_end = (unsigned long)p;
+
+ current->mm->env_start = (unsigned long) p;
+ for (i = bprm->envc; i > 0; i--) {
+ __put_user((unsigned long)p, sp++);
+ len = strnlen_user(p, MAX_ARG_STRLEN);
+ if (!len || len > MAX_ARG_STRLEN)
+ return -EINVAL;
+ p += len;
+ }
+ __put_user(0, sp++);
+ current->mm->env_end = (unsigned long)p;
+
+ return 0;
}
/****************************************************************************/
@@ -190,17 +187,17 @@ static int decompress_exec(
loff_t fpos;
int ret, retval;
- DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len);
+ pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len);
memset(&strm, 0, sizeof(strm));
strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
if (strm.workspace == NULL) {
- DBG_FLT("binfmt_flat: no memory for decompress workspace\n");
+ pr_debug("no memory for decompress workspace\n");
return -ENOMEM;
}
buf = kmalloc(LBUFSIZE, GFP_KERNEL);
if (buf == NULL) {
- DBG_FLT("binfmt_flat: no memory for read buffer\n");
+ pr_debug("no memory for read buffer\n");
retval = -ENOMEM;
goto out_free;
}
@@ -218,49 +215,49 @@ static int decompress_exec(
/* Check minimum size -- gzip header */
if (ret < 10) {
- DBG_FLT("binfmt_flat: file too small?\n");
+ pr_debug("file too small?\n");
goto out_free_buf;
}
/* Check gzip magic number */
if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) {
- DBG_FLT("binfmt_flat: unknown compression magic?\n");
+ pr_debug("unknown compression magic?\n");
goto out_free_buf;
}
/* Check gzip method */
if (buf[2] != 8) {
- DBG_FLT("binfmt_flat: unknown compression method?\n");
+ pr_debug("unknown compression method?\n");
goto out_free_buf;
}
/* Check gzip flags */
if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) ||
(buf[3] & RESERVED)) {
- DBG_FLT("binfmt_flat: unknown flags?\n");
+ pr_debug("unknown flags?\n");
goto out_free_buf;
}
ret = 10;
if (buf[3] & EXTRA_FIELD) {
ret += 2 + buf[10] + (buf[11] << 8);
- if (unlikely(LBUFSIZE <= ret)) {
- DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n");
+ if (unlikely(ret >= LBUFSIZE)) {
+ pr_debug("buffer overflow (EXTRA)?\n");
goto out_free_buf;
}
}
if (buf[3] & ORIG_NAME) {
while (ret < LBUFSIZE && buf[ret++] != 0)
;
- if (unlikely(LBUFSIZE == ret)) {
- DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n");
+ if (unlikely(ret == LBUFSIZE)) {
+ pr_debug("buffer overflow (ORIG_NAME)?\n");
goto out_free_buf;
}
}
if (buf[3] & COMMENT) {
while (ret < LBUFSIZE && buf[ret++] != 0)
;
- if (unlikely(LBUFSIZE == ret)) {
- DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n");
+ if (unlikely(ret == LBUFSIZE)) {
+ pr_debug("buffer overflow (COMMENT)?\n");
goto out_free_buf;
}
}
@@ -273,7 +270,7 @@ static int decompress_exec(
strm.total_out = 0;
if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) {
- DBG_FLT("binfmt_flat: zlib init failed?\n");
+ pr_debug("zlib init failed?\n");
goto out_free_buf;
}
@@ -290,7 +287,7 @@ static int decompress_exec(
}
if (ret < 0) {
- DBG_FLT("binfmt_flat: decompression failed (%d), %s\n",
+ pr_debug("decompression failed (%d), %s\n",
ret, strm.msg);
goto out_zlib;
}
@@ -327,24 +324,23 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
r &= 0x00ffffff; /* Trim ID off here */
}
if (id >= MAX_SHARED_LIBS) {
- printk("BINFMT_FLAT: reference 0x%x to shared library %d",
- (unsigned) r, id);
+ pr_err("reference 0x%lx to shared library %d", r, id);
goto failed;
}
if (curid != id) {
if (internalp) {
- printk("BINFMT_FLAT: reloc address 0x%x not in same module "
- "(%d != %d)", (unsigned) r, curid, id);
+ pr_err("reloc address 0x%lx not in same module "
+ "(%d != %d)", r, curid, id);
goto failed;
- } else if ( ! p->lib_list[id].loaded &&
- IS_ERR_VALUE(load_flat_shared_library(id, p))) {
- printk("BINFMT_FLAT: failed to load library %d", id);
+ } else if (!p->lib_list[id].loaded &&
+ load_flat_shared_library(id, p) < 0) {
+ pr_err("failed to load library %d", id);
goto failed;
}
/* Check versioning information (i.e. time stamps) */
if (p->lib_list[id].build_date && p->lib_list[curid].build_date &&
p->lib_list[curid].build_date < p->lib_list[id].build_date) {
- printk("BINFMT_FLAT: library %d is younger than %d", id, curid);
+ pr_err("library %d is younger than %d", id, curid);
goto failed;
}
}
@@ -358,8 +354,8 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
text_len = p->lib_list[id].text_len;
if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
- printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
- (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
+ pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)",
+ r, start_brk-start_data+text_len, text_len);
goto failed;
}
@@ -369,10 +365,10 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
addr = r - text_len + start_data;
/* Range checked already above so doing the range tests is redundant...*/
- return(addr);
+ return addr;
failed:
- printk(", killing %s!\n", current->comm);
+ pr_cont(", killing %s!\n", current->comm);
send_sig(SIGSEGV, current, 0);
return RELOC_FAILED;
@@ -382,62 +378,57 @@ failed:
static void old_reloc(unsigned long rl)
{
-#ifdef DEBUG
- char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
-#endif
+ static const char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
flat_v2_reloc_t r;
- unsigned long *ptr;
-
+ unsigned long __user *ptr;
+ unsigned long val;
+
r.value = rl;
#if defined(CONFIG_COLDFIRE)
- ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset);
+ ptr = (unsigned long __user *)(current->mm->start_code + r.reloc.offset);
#else
- ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset);
+ ptr = (unsigned long __user *)(current->mm->start_data + r.reloc.offset);
#endif
+ get_user(val, ptr);
+
+ pr_debug("Relocation of variable at DATASEG+%x "
+ "(address %p, currently %lx) into segment %s\n",
+ r.reloc.offset, ptr, val, segment[r.reloc.type]);
-#ifdef DEBUG
- printk("Relocation of variable at DATASEG+%x "
- "(address %p, currently %x) into segment %s\n",
- r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]);
-#endif
-
switch (r.reloc.type) {
case OLD_FLAT_RELOC_TYPE_TEXT:
- *ptr += current->mm->start_code;
+ val += current->mm->start_code;
break;
case OLD_FLAT_RELOC_TYPE_DATA:
- *ptr += current->mm->start_data;
+ val += current->mm->start_data;
break;
case OLD_FLAT_RELOC_TYPE_BSS:
- *ptr += current->mm->end_data;
+ val += current->mm->end_data;
break;
default:
- printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type);
+ pr_err("Unknown relocation type=%x\n", r.reloc.type);
break;
}
+ put_user(val, ptr);
-#ifdef DEBUG
- printk("Relocation became %x\n", (int)*ptr);
-#endif
-}
+ pr_debug("Relocation became %lx\n", val);
+}
/****************************************************************************/
-static int load_flat_file(struct linux_binprm * bprm,
+static int load_flat_file(struct linux_binprm *bprm,
struct lib_info *libinfo, int id, unsigned long *extra_stack)
{
- struct flat_hdr * hdr;
- unsigned long textpos = 0, datapos = 0, result;
- unsigned long realdatastart = 0;
- unsigned long text_len, data_len, bss_len, stack_len, flags;
- unsigned long full_data;
- unsigned long len, memp = 0;
- unsigned long memp_size, extra, rlim;
- unsigned long *reloc = 0, *rp;
+ struct flat_hdr *hdr;
+ unsigned long textpos, datapos, realdatastart;
+ unsigned long text_len, data_len, bss_len, stack_len, full_data, flags;
+ unsigned long len, memp, memp_size, extra, rlim;
+ unsigned long __user *reloc, *rp;
struct inode *inode;
- int i, rev, relocs = 0;
+ int i, rev, relocs;
loff_t fpos;
unsigned long start_code, end_code;
+ ssize_t result;
int ret;
hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */
@@ -469,20 +460,30 @@ static int load_flat_file(struct linux_binprm * bprm,
}
if (flags & FLAT_FLAG_KTRACE)
- printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename);
+ pr_info("Loading file: %s\n", bprm->filename);
if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) {
- printk("BINFMT_FLAT: bad flat file version 0x%x (supported "
- "0x%lx and 0x%lx)\n",
- rev, FLAT_VERSION, OLD_FLAT_VERSION);
+ pr_err("bad flat file version 0x%x (supported 0x%lx and 0x%lx)\n",
+ rev, FLAT_VERSION, OLD_FLAT_VERSION);
ret = -ENOEXEC;
goto err;
}
-
+
/* Don't allow old format executables to use shared libraries */
if (rev == OLD_FLAT_VERSION && id != 0) {
- printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n",
- (int) FLAT_VERSION);
+ pr_err("shared libraries are not available before rev 0x%lx\n",
+ FLAT_VERSION);
+ ret = -ENOEXEC;
+ goto err;
+ }
+
+ /*
+ * Make sure the header params are sane.
+ * 28 bits (256 MB) is way more than reasonable in this case.
+ * If some top bits are set we have probable binary corruption.
+ */
+ if ((text_len | data_len | bss_len | stack_len | full_data) >> 28) {
+ pr_err("bad header\n");
ret = -ENOEXEC;
goto err;
}
@@ -496,7 +497,7 @@ static int load_flat_file(struct linux_binprm * bprm,
#ifndef CONFIG_BINFMT_ZFLAT
if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) {
- printk("Support for ZFLAT executables is not enabled.\n");
+ pr_err("Support for ZFLAT executables is not enabled.\n");
ret = -ENOEXEC;
goto err;
}
@@ -517,11 +518,9 @@ static int load_flat_file(struct linux_binprm * bprm,
/* Flush all traces of the currently running executable */
if (id == 0) {
- result = flush_old_exec(bprm);
- if (result) {
- ret = result;
+ ret = flush_old_exec(bprm);
+ if (ret)
goto err;
- }
/* OK, This is the point of no return */
set_personality(PER_LINUX_32BIT);
@@ -539,48 +538,48 @@ static int load_flat_file(struct linux_binprm * bprm,
* case, and then the fully copied to RAM case which lumps
* it all together.
*/
- if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) {
+ if (!IS_ENABLED(CONFIG_MMU) && !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
/*
* this should give us a ROM ptr, but if it doesn't we don't
* really care
*/
- DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
+ pr_debug("ROM mapping of file (we hope)\n");
textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_EXECUTABLE, 0);
if (!textpos || IS_ERR_VALUE(textpos)) {
- if (!textpos)
- textpos = (unsigned long) -ENOMEM;
- printk("Unable to mmap process text, errno %d\n", (int)-textpos);
ret = textpos;
+ if (!textpos)
+ ret = -ENOMEM;
+ pr_err("Unable to mmap process text, errno %d\n", ret);
goto err;
}
len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
- realdatastart = vm_mmap(0, 0, len,
+ realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
+ ret = realdatastart;
if (!realdatastart)
- realdatastart = (unsigned long) -ENOMEM;
- printk("Unable to allocate RAM for process data, errno %d\n",
- (int)-realdatastart);
+ ret = -ENOMEM;
+ pr_err("Unable to allocate RAM for process data, "
+ "errno %d\n", ret);
vm_munmap(textpos, text_len);
- ret = realdatastart;
goto err;
}
datapos = ALIGN(realdatastart +
MAX_SHARED_LIBS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
- DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n",
- (int)(data_len + bss_len + stack_len), (int)datapos);
+ pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n",
+ data_len + bss_len + stack_len, datapos);
fpos = ntohl(hdr->data_start);
#ifdef CONFIG_BINFMT_ZFLAT
if (flags & FLAT_FLAG_GZDATA) {
- result = decompress_exec(bprm, fpos, (char *) datapos,
+ result = decompress_exec(bprm, fpos, (char *)datapos,
full_data, 0);
} else
#endif
@@ -589,29 +588,30 @@ static int load_flat_file(struct linux_binprm * bprm,
full_data);
}
if (IS_ERR_VALUE(result)) {
- printk("Unable to read data+bss, errno %d\n", (int)-result);
+ ret = result;
+ pr_err("Unable to read data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len);
vm_munmap(realdatastart, len);
- ret = result;
goto err;
}
- reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
+ reloc = (unsigned long __user *)
+ (datapos + (ntohl(hdr->reloc_start) - text_len));
memp = realdatastart;
memp_size = len;
} else {
len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
- textpos = vm_mmap(0, 0, len,
+ textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
if (!textpos || IS_ERR_VALUE(textpos)) {
- if (!textpos)
- textpos = (unsigned long) -ENOMEM;
- printk("Unable to allocate RAM for process text/data, errno %d\n",
- (int)-textpos);
ret = textpos;
+ if (!textpos)
+ ret = -ENOMEM;
+ pr_err("Unable to allocate RAM for process text/data, "
+ "errno %d\n", ret);
goto err;
}
@@ -620,7 +620,7 @@ static int load_flat_file(struct linux_binprm * bprm,
MAX_SHARED_LIBS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
- reloc = (unsigned long *)
+ reloc = (unsigned long __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
memp = textpos;
memp_size = len;
@@ -629,21 +629,59 @@ static int load_flat_file(struct linux_binprm * bprm,
* load it all in and treat it like a RAM load from now on
*/
if (flags & FLAT_FLAG_GZIP) {
- result = decompress_exec(bprm, sizeof (struct flat_hdr),
- (((char *) textpos) + sizeof (struct flat_hdr)),
+#ifndef CONFIG_MMU
+ result = decompress_exec(bprm, sizeof(struct flat_hdr),
+ (((char *)textpos) + sizeof(struct flat_hdr)),
(text_len + full_data
- - sizeof (struct flat_hdr)),
+ - sizeof(struct flat_hdr)),
0);
memmove((void *) datapos, (void *) realdatastart,
full_data);
+#else
+ /*
+ * This is used on MMU systems mainly for testing.
+ * Let's use a kernel buffer to simplify things.
+ */
+ long unz_text_len = text_len - sizeof(struct flat_hdr);
+ long unz_len = unz_text_len + full_data;
+ char *unz_data = vmalloc(unz_len);
+ if (!unz_data) {
+ result = -ENOMEM;
+ } else {
+ result = decompress_exec(bprm, sizeof(struct flat_hdr),
+ unz_data, unz_len, 0);
+ if (result == 0 &&
+ (copy_to_user((void __user *)textpos + sizeof(struct flat_hdr),
+ unz_data, unz_text_len) ||
+ copy_to_user((void __user *)datapos,
+ unz_data + unz_text_len, full_data)))
+ result = -EFAULT;
+ vfree(unz_data);
+ }
+#endif
} else if (flags & FLAT_FLAG_GZDATA) {
result = read_code(bprm->file, textpos, 0, text_len);
- if (!IS_ERR_VALUE(result))
+ if (!IS_ERR_VALUE(result)) {
+#ifndef CONFIG_MMU
result = decompress_exec(bprm, text_len, (char *) datapos,
full_data, 0);
- }
- else
+#else
+ char *unz_data = vmalloc(full_data);
+ if (!unz_data) {
+ result = -ENOMEM;
+ } else {
+ result = decompress_exec(bprm, text_len,
+ unz_data, full_data, 0);
+ if (result == 0 &&
+ copy_to_user((void __user *)datapos,
+ unz_data, full_data))
+ result = -EFAULT;
+ vfree(unz_data);
+ }
#endif
+ }
+ } else
+#endif /* CONFIG_BINFMT_ZFLAT */
{
result = read_code(bprm->file, textpos, 0, text_len);
if (!IS_ERR_VALUE(result))
@@ -652,21 +690,19 @@ static int load_flat_file(struct linux_binprm * bprm,
full_data);
}
if (IS_ERR_VALUE(result)) {
- printk("Unable to read code+data+bss, errno %d\n",(int)-result);
+ ret = result;
+ pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len + data_len + extra +
MAX_SHARED_LIBS * sizeof(unsigned long));
- ret = result;
goto err;
}
}
- if (flags & FLAT_FLAG_KTRACE)
- printk("Mapping is %x, Entry point is %x, data_start is %x\n",
- (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
+ start_code = textpos + sizeof(struct flat_hdr);
+ end_code = textpos + text_len;
+ text_len -= sizeof(struct flat_hdr); /* the real code len */
/* The main program needs a little extra setup in the task structure */
- start_code = textpos + sizeof (struct flat_hdr);
- end_code = textpos + text_len;
if (id == 0) {
current->mm->start_code = start_code;
current->mm->end_code = end_code;
@@ -681,19 +717,19 @@ static int load_flat_file(struct linux_binprm * bprm,
*/
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
+#ifndef CONFIG_MMU
current->mm->context.end_brk = memp + memp_size - stack_len;
+#endif
}
- if (flags & FLAT_FLAG_KTRACE)
- printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n",
+ if (flags & FLAT_FLAG_KTRACE) {
+ pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n",
+ textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
+ pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n",
id ? "Lib" : "Load", bprm->filename,
- (int) start_code, (int) end_code,
- (int) datapos,
- (int) (datapos + data_len),
- (int) (datapos + data_len),
- (int) (((datapos + data_len + bss_len) + 3) & ~3));
-
- text_len -= sizeof(struct flat_hdr); /* the real code len */
+ start_code, end_code, datapos, datapos + data_len,
+ datapos + data_len, (datapos + data_len + bss_len + 3) & ~3);
+ }
/* Store the current module values into the global library structure */
libinfo->lib_list[id].start_code = start_code;
@@ -703,7 +739,7 @@ static int load_flat_file(struct linux_binprm * bprm,
libinfo->lib_list[id].loaded = 1;
libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
libinfo->lib_list[id].build_date = ntohl(hdr->build_date);
-
+
/*
* We just load the allocations into some temporary memory to
* help simplify all this mumbo jumbo
@@ -717,15 +753,20 @@ static int load_flat_file(struct linux_binprm * bprm,
* image.
*/
if (flags & FLAT_FLAG_GOTPIC) {
- for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) {
- unsigned long addr;
- if (*rp) {
- addr = calc_reloc(*rp, libinfo, id, 0);
+ for (rp = (unsigned long __user *)datapos; ; rp++) {
+ unsigned long addr, rp_val;
+ if (get_user(rp_val, rp))
+ return -EFAULT;
+ if (rp_val == 0xffffffff)
+ break;
+ if (rp_val) {
+ addr = calc_reloc(rp_val, libinfo, id, 0);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
}
- *rp = addr;
+ if (put_user(addr, rp))
+ return -EFAULT;
}
}
}
@@ -742,19 +783,23 @@ static int load_flat_file(struct linux_binprm * bprm,
* __start to address 4 so that is okay).
*/
if (rev > OLD_FLAT_VERSION) {
- unsigned long persistent = 0;
- for (i=0; i < relocs; i++) {
+ unsigned long __maybe_unused persistent = 0;
+ for (i = 0; i < relocs; i++) {
unsigned long addr, relval;
- /* Get the address of the pointer to be
- relocated (of course, the address has to be
- relocated first). */
- relval = ntohl(reloc[i]);
- if (flat_set_persistent (relval, &persistent))
+ /*
+ * Get the address of the pointer to be
+ * relocated (of course, the address has to be
+ * relocated first).
+ */
+ if (get_user(relval, reloc + i))
+ return -EFAULT;
+ relval = ntohl(relval);
+ if (flat_set_persistent(relval, &persistent))
continue;
addr = flat_get_relocate_addr(relval);
- rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1);
- if (rp == (unsigned long *)RELOC_FAILED) {
+ rp = (unsigned long __user *)calc_reloc(addr, libinfo, id, 1);
+ if (rp == (unsigned long __user *)RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
}
@@ -780,17 +825,23 @@ static int load_flat_file(struct linux_binprm * bprm,
}
}
} else {
- for (i=0; i < relocs; i++)
- old_reloc(ntohl(reloc[i]));
+ for (i = 0; i < relocs; i++) {
+ unsigned long relval;
+ if (get_user(relval, reloc + i))
+ return -EFAULT;
+ relval = ntohl(relval);
+ old_reloc(relval);
+ }
}
-
+
flush_icache_range(start_code, end_code);
/* zero the BSS, BRK and stack areas */
- memset((void*)(datapos + data_len), 0, bss_len +
- (memp + memp_size - stack_len - /* end brk */
- libinfo->lib_list[id].start_brk) + /* start brk */
- stack_len);
+ if (clear_user((void __user *)(datapos + data_len), bss_len +
+ (memp + memp_size - stack_len - /* end brk */
+ libinfo->lib_list[id].start_brk) + /* start brk */
+ stack_len))
+ return -EFAULT;
return 0;
err:
@@ -837,7 +888,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
res = prepare_binprm(&bprm);
- if (!IS_ERR_VALUE(res))
+ if (!res)
res = load_flat_file(&bprm, libs, id, NULL);
abort_creds(bprm.cred);
@@ -846,7 +897,7 @@ out:
allow_write_access(bprm.file);
fput(bprm.file);
- return(res);
+ return res;
}
#endif /* CONFIG_BINFMT_SHARED_FLAT */
@@ -857,18 +908,17 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_flat_binary(struct linux_binprm * bprm)
+static int load_flat_binary(struct linux_binprm *bprm)
{
struct lib_info libinfo;
struct pt_regs *regs = current_pt_regs();
- unsigned long p = bprm->p;
- unsigned long stack_len;
+ unsigned long stack_len = 0;
unsigned long start_addr;
- unsigned long *sp;
int res;
int i, j;
memset(&libinfo, 0, sizeof(libinfo));
+
/*
* We have to add the size of our arguments to our stack size
* otherwise it's too easy for users to create stack overflows
@@ -876,38 +926,54 @@ static int load_flat_binary(struct linux_binprm * bprm)
* pedantic and include space for the argv/envp array as it may have
* a lot of entries.
*/
-#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *))
- stack_len = TOP_OF_ARGS - bprm->p; /* the strings */
- stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
- stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
- stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */
-
+#ifndef CONFIG_MMU
+ stack_len += PAGE_SIZE * MAX_ARG_PAGES - bprm->p; /* the strings */
+#endif
+ stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
+ stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
+ stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN);
+
res = load_flat_file(bprm, &libinfo, 0, &stack_len);
- if (IS_ERR_VALUE(res))
+ if (res < 0)
return res;
-
+
/* Update data segment pointers for all libraries */
- for (i=0; i<MAX_SHARED_LIBS; i++)
- if (libinfo.lib_list[i].loaded)
- for (j=0; j<MAX_SHARED_LIBS; j++)
- (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] =
- (libinfo.lib_list[j].loaded)?
- libinfo.lib_list[j].start_data:UNLOADED_LIB;
+ for (i = 0; i < MAX_SHARED_LIBS; i++) {
+ if (!libinfo.lib_list[i].loaded)
+ continue;
+ for (j = 0; j < MAX_SHARED_LIBS; j++) {
+ unsigned long val = libinfo.lib_list[j].loaded ?
+ libinfo.lib_list[j].start_data : UNLOADED_LIB;
+ unsigned long __user *p = (unsigned long __user *)
+ libinfo.lib_list[i].start_data;
+ p -= j + 1;
+ if (put_user(val, p))
+ return -EFAULT;
+ }
+ }
install_exec_creds(bprm);
set_binfmt(&flat_format);
- p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4;
- DBG_FLT("p=%x\n", (int)p);
-
- /* copy the arg pages onto the stack, this could be more efficient :-) */
- for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--)
- * (char *) --p =
- ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE];
+#ifdef CONFIG_MMU
+ res = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+ if (!res)
+ res = create_flat_tables(bprm, bprm->p);
+#else
+ /* Stash our initial stack pointer into the mm structure */
+ current->mm->start_stack =
+ ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4;
+ pr_debug("sp=%lx\n", current->mm->start_stack);
+
+ /* copy the arg pages onto the stack */
+ res = transfer_args_to_stack(bprm, &current->mm->start_stack);
+ if (!res)
+ res = create_flat_tables(bprm, current->mm->start_stack);
+#endif
+ if (res)
+ return res;
- sp = (unsigned long *) create_flat_tables(p, bprm);
-
/* Fake some return addresses to ensure the call chain will
* initialise library in order for us. We are required to call
* lib 1 first, then 2, ... and finally the main program (id 0).
@@ -915,24 +981,24 @@ static int load_flat_binary(struct linux_binprm * bprm)
start_addr = libinfo.lib_list[0].entry;
#ifdef CONFIG_BINFMT_SHARED_FLAT
- for (i = MAX_SHARED_LIBS-1; i>0; i--) {
+ for (i = MAX_SHARED_LIBS-1; i > 0; i--) {
if (libinfo.lib_list[i].loaded) {
/* Push previos first to call address */
- --sp; put_user(start_addr, sp);
+ unsigned long __user *sp;
+ current->mm->start_stack -= sizeof(unsigned long);
+ sp = (unsigned long __user *)current->mm->start_stack;
+ __put_user(start_addr, sp);
start_addr = libinfo.lib_list[i].entry;
}
}
#endif
-
- /* Stash our initial stack pointer into the mm structure */
- current->mm->start_stack = (unsigned long )sp;
#ifdef FLAT_PLAT_INIT
FLAT_PLAT_INIT(regs);
#endif
- DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
- (int)regs, (int)start_addr, (int)current->mm->start_stack);
-
+
+ pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n",
+ regs, start_addr, current->mm->start_stack);
start_thread(regs, start_addr, current->mm->start_stack);
return 0;
@@ -945,9 +1011,6 @@ static int __init init_flat_binfmt(void)
register_binfmt(&flat_format);
return 0;
}
-
-/****************************************************************************/
-
core_initcall(init_flat_binfmt);
/****************************************************************************/
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 8a108c435bc62..6103a6362ccd1 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -659,13 +659,12 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
break;
case 3:
/* Delete this handler. */
- root = dget(file->f_path.dentry->d_sb->s_root);
+ root = file_inode(file)->i_sb->s_root;
inode_lock(d_inode(root));
kill_node(e);
inode_unlock(d_inode(root));
- dput(root);
break;
default:
return res;
@@ -687,8 +686,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
{
Node *e;
struct inode *inode;
- struct dentry *root, *dentry;
- struct super_block *sb = file->f_path.dentry->d_sb;
+ struct super_block *sb = file_inode(file)->i_sb;
+ struct dentry *root = sb->s_root, *dentry;
int err = 0;
e = create_entry(buffer, count);
@@ -696,7 +695,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
if (IS_ERR(e))
return PTR_ERR(e);
- root = dget(sb->s_root);
inode_lock(d_inode(root));
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
@@ -749,7 +747,6 @@ out2:
dput(dentry);
out:
inode_unlock(d_inode(root));
- dput(root);
if (err) {
kfree(e);
@@ -790,14 +787,13 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
break;
case 3:
/* Delete all handlers. */
- root = dget(file->f_path.dentry->d_sb->s_root);
+ root = file_inode(file)->i_sb->s_root;
inode_lock(d_inode(root));
while (!list_empty(&entries))
kill_node(list_entry(entries.next, Node, list));
inode_unlock(d_inode(root));
- dput(root);
break;
default:
return res;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3172c4e2f5025..d402899ba1350 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,7 @@
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/dax.h>
+#include <linux/badblocks.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -50,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+ va_end(args);
+}
+
static void bdev_write_inode(struct block_device *bdev)
{
struct inode *inode = bdev->bd_inode;
@@ -162,15 +175,15 @@ static struct inode *bdev_file_inode(struct file *file)
}
static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
if (IS_DAX(inode))
- return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+ return dax_do_io(iocb, inode, iter, blkdev_get_block,
NULL, DIO_SKIP_DIO_COUNT);
- return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
+ return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
blkdev_get_block, NULL, NULL,
DIO_SKIP_DIO_COUNT);
}
@@ -331,7 +344,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -403,7 +416,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
result = blk_queue_enter(bdev->bd_queue, false);
if (result)
return result;
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_READ);
blk_queue_exit(bdev->bd_queue);
return result;
}
@@ -432,7 +446,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
struct page *page, struct writeback_control *wbc)
{
int result;
- int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
if (!ops->rw_page || bdev_get_integrity(bdev))
@@ -442,7 +455,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
return result;
set_page_writeback(page);
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_WRITE);
if (result)
end_page_writeback(page);
else
@@ -480,7 +494,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
if (size < 0)
return size;
- if (!ops->direct_access)
+ if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
return -EOPNOTSUPP;
if ((sector + DIV_ROUND_UP(size, 512)) >
part_nr_sects_read(bdev->bd_part))
@@ -488,7 +502,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+ avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
if (!avail)
return -ERANGE;
if (avail > 0 && avail & ~PAGE_MASK)
@@ -497,6 +511,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
+/**
+ * bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @sb: The superblock of the device
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: negative errno if unsupported, 0 if supported.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+ struct blk_dax_ctl dax = {
+ .sector = 0,
+ .size = PAGE_SIZE,
+ };
+ int err;
+
+ if (blocksize != PAGE_SIZE) {
+ vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+ return -EINVAL;
+ }
+
+ err = bdev_direct_access(sb->s_bdev, &dax);
+ if (err < 0) {
+ switch (err) {
+ case -EOPNOTSUPP:
+ vfs_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ break;
+ case -EINVAL:
+ vfs_msg(sb, KERN_ERR,
+ "error: unaligned partition for dax");
+ break;
+ default:
+ vfs_msg(sb, KERN_ERR,
+ "error: dax access failed (%d)", err);
+ }
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
+/**
+ * bdev_dax_capable() - Return if the raw device is capable for dax
+ * @bdev: The device for raw block device access
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+ struct blk_dax_ctl dax = {
+ .size = PAGE_SIZE,
+ };
+
+ if (!IS_ENABLED(CONFIG_FS_DAX))
+ return false;
+
+ dax.sector = 0;
+ if (bdev_direct_access(bdev, &dax) < 0)
+ return false;
+
+ dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+ if (bdev_direct_access(bdev, &dax) < 0)
+ return false;
+
+ return true;
+}
+
/*
* pseudo-fs
*/
@@ -532,7 +615,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
@@ -542,24 +624,13 @@ static void init_once(void *foo)
mutex_init(&bdev->bd_fsfreeze_mutex);
}
-static inline void __bd_forget(struct inode *inode)
-{
- list_del_init(&inode->i_devices);
- inode->i_bdev = NULL;
- inode->i_mapping = &inode->i_data;
-}
-
static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
- struct list_head *p;
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
spin_lock(&bdev_lock);
- while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
- __bd_forget(list_entry(p, struct inode, i_devices));
- }
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
}
@@ -723,7 +794,6 @@ static struct block_device *bd_acquire(struct inode *inode)
bdgrab(bdev);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
- list_add(&inode->i_devices, &bdev->bd_inodes);
}
spin_unlock(&bdev_lock);
}
@@ -739,7 +809,8 @@ void bd_forget(struct inode *inode)
spin_lock(&bdev_lock);
if (!sb_is_blkdev_sb(inode->i_sb))
bdev = inode->i_bdev;
- __bd_forget(inode);
+ inode->i_bdev = NULL;
+ inode->i_mapping = &inode->i_data;
spin_unlock(&bdev_lock);
if (bdev)
@@ -1149,7 +1220,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
inode_lock(bdev->bd_inode);
i_size_write(bdev->bd_inode, size);
inode_unlock(bdev->bd_inode);
- while (bsize < PAGE_CACHE_SIZE) {
+ while (bsize < PAGE_SIZE) {
if (size & bsize)
break;
bsize <<= 1;
@@ -1205,10 +1276,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
- bdev->bd_inode->i_flags = S_DAX;
- else
- bdev->bd_inode->i_flags = 0;
+ bdev->bd_inode->i_flags = 0;
if (!partno) {
ret = -ENXIO;
@@ -1238,7 +1306,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- if (!blkdev_dax_capable(bdev))
+ if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
@@ -1275,7 +1343,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- if (!blkdev_dax_capable(bdev))
+ if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
@@ -1660,12 +1728,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
- if (ret > 0) {
- ssize_t err;
- err = generic_write_sync(file, iocb->ki_pos - ret, ret);
- if (err < 0)
- ret = err;
- }
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
blk_finish_plug(&plug);
return ret;
}
@@ -1724,79 +1788,13 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
-#ifdef CONFIG_FS_DAX
-/*
- * In the raw block case we do not need to contend with truncation nor
- * unwritten file extents. Without those concerns there is no need for
- * additional locking beyond the mmap_sem context that these routines
- * are already executing under.
- *
- * Note, there is no protection if the block device is dynamically
- * resized (partition grow/shrink) during a fault. A stable block device
- * size is already not enforced in the blkdev_direct_IO path.
- *
- * For DAX, it is the responsibility of the block device driver to
- * ensure the whole-disk device size is stable while requests are in
- * flight.
- *
- * Finally, unlike the filemap_page_mkwrite() case there is no
- * filesystem superblock to sync against freezing. We still include a
- * pfn_mkwrite callback for dax drivers to receive write fault
- * notifications.
- */
-static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- return __dax_fault(vma, vmf, blkdev_get_block, NULL);
-}
-
-static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
- struct vm_fault *vmf)
-{
- return dax_pfn_mkwrite(vma, vmf);
-}
-
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
-{
- return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static const struct vm_operations_struct blkdev_dax_vm_ops = {
- .fault = blkdev_dax_fault,
- .pmd_fault = blkdev_dax_pmd_fault,
- .pfn_mkwrite = blkdev_dax_pfn_mkwrite,
-};
-
-static const struct vm_operations_struct blkdev_default_vm_ops = {
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
-};
-
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct inode *bd_inode = bdev_file_inode(file);
-
- file_accessed(file);
- if (IS_DAX(bd_inode)) {
- vma->vm_ops = &blkdev_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
- } else {
- vma->vm_ops = &blkdev_default_vm_ops;
- }
-
- return 0;
-}
-#else
-#define blkdev_mmap generic_file_mmap
-#endif
-
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .mmap = blkdev_mmap,
+ .mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
@@ -1845,7 +1843,7 @@ struct block_device *lookup_bdev(const char *pathname)
if (!S_ISBLK(inode->i_mode))
goto fail;
error = -EACCES;
- if (path.mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(&path))
goto fail;
error = -ENOMEM;
bdev = bd_acquire(inode);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6d263bb1621cd..53bb7af4e5f06 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,17 +55,13 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
}
if (size > 0) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
- } else if (size == -ENOENT || size == -ENODATA || size == 0) {
- /* FIXME, who returns -ENOENT? I think nobody */
+ } else if (size == -ERANGE || size == -ENODATA || size == 0) {
acl = NULL;
} else {
acl = ERR_PTR(-EIO);
}
kfree(value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
return acl;
}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5fb60ea7eee2b..e0f071f6b5a76 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -34,6 +34,10 @@
struct __btrfs_workqueue {
struct workqueue_struct *normal_wq;
+
+ /* File system this workqueue services */
+ struct btrfs_fs_info *fs_info;
+
/* List head pointing to ordered work list */
struct list_head ordered_list;
@@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg) \
normal_work_helper(work); \
}
+struct btrfs_fs_info *
+btrfs_workqueue_owner(struct __btrfs_workqueue *wq)
+{
+ return wq->fs_info;
+}
+
+struct btrfs_fs_info *
+btrfs_work_owner(struct btrfs_work *work)
+{
+ return work->wq->fs_info;
+}
+
BTRFS_WORK_HELPER(worker_helper);
BTRFS_WORK_HELPER(delalloc_helper);
BTRFS_WORK_HELPER(flush_delalloc_helper);
@@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper);
BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
- int thresh)
+__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
+ unsigned int flags, int limit_active, int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
+ ret->fs_info = fs_info;
ret->limit_active = limit_active;
atomic_set(&ret->pending, 0);
if (thresh == 0)
@@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name,
unsigned int flags,
int limit_active,
int thresh)
@@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
if (!ret)
return NULL;
- ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ ret->normal = __btrfs_alloc_workqueue(fs_info, name,
+ flags & ~WQ_HIGHPRI,
limit_active, thresh);
if (!ret->normal) {
kfree(ret);
@@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
}
if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
- thresh);
+ ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
+ limit_active, thresh);
if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
kfree(ret);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ad4d0647d1a6c..8e52484cd4615 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -21,6 +21,7 @@
#define __BTRFS_ASYNC_THREAD_
#include <linux/workqueue.h>
+struct btrfs_fs_info;
struct btrfs_workqueue;
/* Internal use only */
struct __btrfs_workqueue;
@@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
-struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name,
unsigned int flags,
int limit_active,
int thresh);
@@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
+struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
+struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 80e8472d618b8..2b88439c2ee86 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void)
btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
sizeof(struct __prelim_ref),
0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_prelim_ref_cache)
return -ENOMEM;
@@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- if (btrfs_test_is_dummy_root(root)) {
+ if (btrfs_is_testing(fs_info)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -ENOENT;
goto out;
@@ -1939,7 +1939,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
* from ipath->fspath->val[i].
* when it returns, there are ipath->fspath->elem_cnt number of paths available
* in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
- * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
+ * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
* it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
* have been needed to return all paths.
*/
@@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
if (!ifp) {
- kfree(fspath);
+ vfree(fspath);
return ERR_PTR(-ENOMEM);
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 61205e3bbefac..4919aedb5fc18 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,6 +196,16 @@ struct btrfs_inode {
struct list_head delayed_iput;
long delayed_iput_count;
+ /*
+ * To avoid races between lockless (i_mutex not held) direct IO writes
+ * and concurrent fsync requests. Direct IO writes must acquire read
+ * access on this semaphore for creating an extent map and its
+ * corresponding ordered extent. The fast fsync path must acquire write
+ * access on this semaphore before it collects ordered extents and
+ * extent maps.
+ */
+ struct rw_semaphore dio_sem;
+
struct inode vfs_inode;
};
@@ -303,7 +313,7 @@ struct btrfs_dio_private {
struct bio *dio_bio;
/*
- * The original bio may be splited to several sub-bios, this is
+ * The original bio may be split to several sub-bios, this is
* done during endio of sub-bios
*/
int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index e34a71b3e2253..5d5cae05818da 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -757,7 +757,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
BUG_ON(NULL == l);
ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)PAGE_CACHE_SIZE) {
+ if (ret < (int)PAGE_SIZE) {
printk(KERN_INFO
"btrfsic: read @logical %llu failed!\n",
tmp_next_block_ctx.start);
@@ -1231,15 +1231,15 @@ static void btrfsic_read_from_block_data(
size_t offset_in_page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
WARN_ON(offset + len > block_ctx->len);
- offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1);
+ offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1);
while (len > 0) {
- cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
- BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
+ cur = min(len, ((size_t)PAGE_SIZE - offset_in_page));
+ BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
kaddr = block_ctx->datav[i];
memcpy(dst, kaddr + offset_in_page, cur);
@@ -1605,8 +1605,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
BUG_ON(!block_ctx->datav);
BUG_ON(!block_ctx->pagev);
- num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
while (num_pages > 0) {
num_pages--;
if (block_ctx->datav[num_pages]) {
@@ -1637,15 +1637,15 @@ static int btrfsic_read_block(struct btrfsic_state *state,
BUG_ON(block_ctx->datav);
BUG_ON(block_ctx->pagev);
BUG_ON(block_ctx->mem_to_free);
- if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
+ if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: read_block() with unaligned bytenr %llu\n",
block_ctx->dev_bytenr);
return -1;
}
- num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
sizeof(*block_ctx->pagev)) *
num_pages, GFP_NOFS);
@@ -1673,11 +1673,12 @@ static int btrfsic_read_block(struct btrfsic_state *state,
}
bio->bi_bdev = block_ctx->dev->bdev;
bio->bi_iter.bi_sector = dev_bytenr >> 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
- PAGE_CACHE_SIZE, 0);
- if (PAGE_CACHE_SIZE != ret)
+ PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret)
break;
}
if (j == i) {
@@ -1685,7 +1686,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
"btrfsic: error, failed to add a single page!\n");
return -1;
}
- if (submit_bio_wait(READ, bio)) {
+ if (submit_bio_wait(bio)) {
printk(KERN_INFO
"btrfsic: read error at logical %llu dev %s!\n",
block_ctx->start, block_ctx->dev->name);
@@ -1693,7 +1694,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
bio_put(bio);
- dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+ dev_bytenr += (j - i) * PAGE_SIZE;
i = j;
}
for (i = 0; i < num_pages; i++) {
@@ -1769,9 +1770,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
u32 crc = ~(u32)0;
unsigned int i;
- if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+ if (num_pages * PAGE_SIZE < state->metablock_size)
return 1; /* not metadata */
- num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+ num_pages = state->metablock_size >> PAGE_SHIFT;
h = (struct btrfs_header *)datav[0];
if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1779,8 +1780,8 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
for (i = 0; i < num_pages; i++) {
u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
- size_t sublen = i ? PAGE_CACHE_SIZE :
- (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_SIZE :
+ (PAGE_SIZE - BTRFS_CSUM_SIZE);
crc = btrfs_crc32c(crc, data, sublen);
}
@@ -1826,14 +1827,14 @@ again:
if (block->is_superblock) {
bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
mapped_datav[0]);
- if (num_pages * PAGE_CACHE_SIZE <
+ if (num_pages * PAGE_SIZE <
BTRFS_SUPER_INFO_SIZE) {
printk(KERN_INFO
"btrfsic: cannot work with too short bios!\n");
return;
}
is_metadata = 1;
- BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+ BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1));
processed_len = BTRFS_SUPER_INFO_SIZE;
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
@@ -1844,7 +1845,7 @@ again:
}
if (is_metadata) {
if (!block->is_superblock) {
- if (num_pages * PAGE_CACHE_SIZE <
+ if (num_pages * PAGE_SIZE <
state->metablock_size) {
printk(KERN_INFO
"btrfsic: cannot work with too short bios!\n");
@@ -1880,7 +1881,7 @@ again:
}
block->logical_bytenr = bytenr;
} else {
- if (num_pages * PAGE_CACHE_SIZE <
+ if (num_pages * PAGE_SIZE <
state->datablock_size) {
printk(KERN_INFO
"btrfsic: cannot work with too short bios!\n");
@@ -1939,7 +1940,7 @@ again:
/*
* Clear all references of this block. Do not free
* the block itself even if is not referenced anymore
- * because it still carries valueable information
+ * because it still carries valuable information
* like whether it was ever written and IO completed.
*/
list_for_each_entry_safe(l, tmp, &block->ref_to_list,
@@ -2013,7 +2014,7 @@ again:
block->logical_bytenr = bytenr;
block->is_metadata = 1;
if (block->is_superblock) {
- BUG_ON(PAGE_CACHE_SIZE !=
+ BUG_ON(PAGE_SIZE !=
BTRFS_SUPER_INFO_SIZE);
ret = btrfsic_process_written_superblock(
state,
@@ -2172,8 +2173,8 @@ again:
continue_loop:
BUG_ON(!processed_len);
dev_bytenr += processed_len;
- mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
- num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+ mapped_datav += processed_len >> PAGE_SHIFT;
+ num_pages -= processed_len >> PAGE_SHIFT;
goto again;
}
@@ -2206,7 +2207,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
block->dev_bytenr, block->mirror_num);
next_block = block->next_in_same_bio;
block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
@@ -2242,7 +2243,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
block->dev_bytenr, block->mirror_num);
block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
@@ -2645,7 +2646,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
btrfsic_get_block_type(state, block),
block->logical_bytenr, block->dev_state->name,
block->dev_bytenr, block->mirror_num);
@@ -2855,12 +2856,12 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
return ds;
}
-int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
{
struct btrfsic_dev_state *dev_state;
if (!btrfsic_is_initialized)
- return submit_bh(rw, bh);
+ return submit_bh(op, op_flags, bh);
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bh() might also be called before
@@ -2869,26 +2870,26 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
/* Only called to write the superblock (incl. FLUSH/FUA) */
if (NULL != dev_state &&
- (rw & WRITE) && bh->b_size > 0) {
+ (op == REQ_OP_WRITE) && bh->b_size > 0) {
u64 dev_bytenr;
dev_bytenr = 4096 * bh->b_blocknr;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu),"
- " size=%zu, data=%p, bdev=%p)\n",
- rw, (unsigned long long)bh->b_blocknr,
+ "submit_bh(op=0x%x,0x%x, blocknr=%llu "
+ "(bytenr %llu), size=%zu, data=%p, bdev=%p)\n",
+ op, op_flags, (unsigned long long)bh->b_blocknr,
dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
btrfsic_process_written_block(dev_state, dev_bytenr,
&bh->b_data, 1, NULL,
- NULL, bh, rw);
- } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ NULL, bh, op_flags);
+ } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
- rw, bh->b_bdev);
+ "submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n",
+ op, op_flags, bh->b_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
@@ -2906,7 +2907,7 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
block->never_written = 0;
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = rw;
+ block->submit_bio_bh_rw = op_flags;
block->orig_bio_bh_private = bh->b_private;
block->orig_bio_bh_end_io.bh = bh->b_end_io;
block->next_in_same_bio = NULL;
@@ -2915,10 +2916,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
}
}
mutex_unlock(&btrfsic_mutex);
- return submit_bh(rw, bh);
+ return submit_bh(op, op_flags, bh);
}
-static void __btrfsic_submit_bio(int rw, struct bio *bio)
+static void __btrfsic_submit_bio(struct bio *bio)
{
struct btrfsic_dev_state *dev_state;
@@ -2930,7 +2931,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
* btrfsic_mount(), this might return NULL */
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
if (NULL != dev_state &&
- (rw & WRITE) && NULL != bio->bi_io_vec) {
+ (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) {
unsigned int i;
u64 dev_bytenr;
u64 cur_bytenr;
@@ -2942,9 +2943,9 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bio(rw=0x%x, bi_vcnt=%u,"
+ "submit_bio(rw=%d,0x%x, bi_vcnt=%u,"
" bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- rw, bio->bi_vcnt,
+ bio_op(bio), bio->bi_rw, bio->bi_vcnt,
(unsigned long long)bio->bi_iter.bi_sector,
dev_bytenr, bio->bi_bdev);
@@ -2954,7 +2955,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
goto leave;
cur_bytenr = dev_bytenr;
for (i = 0; i < bio->bi_vcnt; i++) {
- BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+ BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE);
mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
if (!mapped_datav[i]) {
while (i > 0) {
@@ -2975,18 +2976,18 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
btrfsic_process_written_block(dev_state, dev_bytenr,
mapped_datav, bio->bi_vcnt,
bio, &bio_is_patched,
- NULL, rw);
+ NULL, bio->bi_rw);
while (i > 0) {
i--;
kunmap(bio->bi_io_vec[i].bv_page);
}
kfree(mapped_datav);
- } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ } else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
- rw, bio->bi_bdev);
+ "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_rw, bio->bi_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
@@ -3004,7 +3005,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
block->never_written = 0;
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = rw;
+ block->submit_bio_bh_rw = bio->bi_rw;
block->orig_bio_bh_private = bio->bi_private;
block->orig_bio_bh_end_io.bio = bio->bi_end_io;
block->next_in_same_bio = NULL;
@@ -3016,16 +3017,16 @@ leave:
mutex_unlock(&btrfsic_mutex);
}
-void btrfsic_submit_bio(int rw, struct bio *bio)
+void btrfsic_submit_bio(struct bio *bio)
{
- __btrfsic_submit_bio(rw, bio);
- submit_bio(rw, bio);
+ __btrfsic_submit_bio(bio);
+ submit_bio(bio);
}
-int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+int btrfsic_submit_bio_wait(struct bio *bio)
{
- __btrfsic_submit_bio(rw, bio);
- return submit_bio_wait(rw, bio);
+ __btrfsic_submit_bio(bio);
+ return submit_bio_wait(bio);
}
int btrfsic_mount(struct btrfs_root *root,
@@ -3037,16 +3038,16 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ if (root->nodesize & ((u64)PAGE_SIZE - 1)) {
printk(KERN_INFO
- "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->nodesize, PAGE_CACHE_SIZE);
+ "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
+ root->nodesize, PAGE_SIZE);
return -1;
}
- if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ if (root->sectorsize & ((u64)PAGE_SIZE - 1)) {
printk(KERN_INFO
- "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->sectorsize, PAGE_CACHE_SIZE);
+ "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
+ root->sectorsize, PAGE_SIZE);
return -1;
}
state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 13b8566c97ab4..f78dff1c7e86c 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -20,9 +20,9 @@
#define __BTRFS_CHECK_INTEGRITY__
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-int btrfsic_submit_bh(int rw, struct buffer_head *bh);
-void btrfsic_submit_bio(int rw, struct bio *bio);
-int btrfsic_submit_bio_wait(int rw, struct bio *bio);
+int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);
+void btrfsic_submit_bio(struct bio *bio);
+int btrfsic_submit_bio_wait(struct bio *bio);
#else
#define btrfsic_submit_bh submit_bh
#define btrfsic_submit_bio submit_bio
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3346cd8f99103..029db6e1105c7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -119,7 +119,7 @@ static int check_compressed_csum(struct inode *inode,
csum = ~(u32)0;
kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
+ csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE);
btrfs_csum_final(csum, (char *)&csum);
kunmap_atomic(kaddr);
@@ -190,7 +190,7 @@ csum_failed:
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
page->mapping = NULL;
- page_cache_release(page);
+ put_page(page);
}
/* do io completion on the original bio */
@@ -224,8 +224,8 @@ out:
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
- unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long index = cb->start >> PAGE_SHIFT;
+ unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
int i;
@@ -247,7 +247,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
if (cb->errors)
SetPageError(pages[i]);
end_page_writeback(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
@@ -304,7 +304,7 @@ static void end_compressed_bio_write(struct bio *bio)
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
page->mapping = NULL;
- page_cache_release(page);
+ put_page(page);
}
/* finally free the cb struct */
@@ -341,7 +341,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
int ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+ WARN_ON(start & ((u64)PAGE_SIZE - 1));
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
if (!cb)
return -ENOMEM;
@@ -363,6 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
kfree(cb);
return -ENOMEM;
}
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
atomic_inc(&cb->pending_bios);
@@ -373,15 +374,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_iter.bi_size)
- ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
- PAGE_CACHE_SIZE,
+ ret = io_tree->ops->merge_bio_hook(page, 0,
+ PAGE_SIZE,
bio, 0);
else
ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
+ if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) <
+ PAGE_SIZE) {
bio_get(bio);
/*
@@ -401,24 +402,28 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_map_bio(root, bio, 0, 1);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
bio_put(bio);
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
BUG_ON(!bio);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
- bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_add_page(bio, page, PAGE_SIZE, 0);
}
- if (bytes_left < PAGE_CACHE_SIZE) {
+ if (bytes_left < PAGE_SIZE) {
btrfs_info(BTRFS_I(inode)->root->fs_info,
"bytes left %lu compress len %lu nr %lu",
bytes_left, cb->compressed_len, cb->nr_pages);
}
- bytes_left -= PAGE_CACHE_SIZE;
- first_byte += PAGE_CACHE_SIZE;
+ bytes_left -= PAGE_SIZE;
+ first_byte += PAGE_SIZE;
cond_resched();
}
bio_get(bio);
@@ -431,8 +436,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_map_bio(root, bio, 0, 1);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
bio_put(bio);
return 0;
@@ -457,17 +465,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int misses = 0;
page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
- last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+ last_offset = (page_offset(page) + PAGE_SIZE);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
if (isize == 0)
return 0;
- end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_CACHE_SHIFT;
+ pg_index = last_offset >> PAGE_SHIFT;
if (pg_index > end_index)
break;
@@ -488,11 +496,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
- page_cache_release(page);
+ put_page(page);
goto next;
}
- end = last_offset + PAGE_CACHE_SIZE - 1;
+ end = last_offset + PAGE_SIZE - 1;
/*
* at this point, we have a locked page in the page cache
* for these bytes in the file. But, we have to make
@@ -502,27 +510,27 @@ static noinline int add_ra_bio_pages(struct inode *inode,
lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em || last_offset < em->start ||
- (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+ (last_offset + PAGE_SIZE > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, last_offset, end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
break;
}
free_extent_map(em);
if (page->index == end_index) {
char *userpage;
- size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+ size_t zero_offset = isize & (PAGE_SIZE - 1);
if (zero_offset) {
int zeros;
- zeros = PAGE_CACHE_SIZE - zero_offset;
+ zeros = PAGE_SIZE - zero_offset;
userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, zeros);
flush_dcache_page(page);
@@ -531,19 +539,19 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
ret = bio_add_page(cb->orig_bio, page,
- PAGE_CACHE_SIZE, 0);
+ PAGE_SIZE, 0);
- if (ret == PAGE_CACHE_SIZE) {
+ if (ret == PAGE_SIZE) {
nr_pages++;
- page_cache_release(page);
+ put_page(page);
} else {
unlock_extent(tree, last_offset, end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
break;
}
next:
- last_offset += PAGE_CACHE_SIZE;
+ last_offset += PAGE_SIZE;
}
return 0;
}
@@ -567,7 +575,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
struct btrfs_root *root = BTRFS_I(inode)->root;
- unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
unsigned long compressed_len;
unsigned long nr_pages;
unsigned long pg_index;
@@ -589,7 +597,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
page_offset(bio->bi_io_vec->bv_page),
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em)
return -EIO;
@@ -617,7 +625,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
- nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
+ nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
if (!cb->compressed_pages)
@@ -640,12 +648,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
add_ra_bio_pages(inode, em_start + em_len, cb);
/* include any pages we added in add_ra-bio_pages */
- uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
cb->len = uncompressed_len;
comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
if (!comp_bio)
goto fail2;
+ bio_set_op_attrs (comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
atomic_inc(&cb->pending_bios);
@@ -653,18 +662,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_CACHE_SHIFT;
+ page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
- ret = tree->ops->merge_bio_hook(READ, page, 0,
- PAGE_CACHE_SIZE,
+ ret = tree->ops->merge_bio_hook(page, 0,
+ PAGE_SIZE,
comp_bio, 0);
else
ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
+ if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
+ PAGE_SIZE) {
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
@@ -687,8 +696,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
root->sectorsize);
- ret = btrfs_map_bio(root, READ, comp_bio,
- mirror_num, 0);
+ ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
if (ret) {
bio->bi_error = ret;
bio_endio(comp_bio);
@@ -699,12 +707,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
GFP_NOFS);
BUG_ON(!comp_bio);
+ bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
- bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+ bio_add_page(comp_bio, page, PAGE_SIZE, 0);
}
- cur_disk_byte += PAGE_CACHE_SIZE;
+ cur_disk_byte += PAGE_SIZE;
}
bio_get(comp_bio);
@@ -717,7 +726,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
if (ret) {
bio->bi_error = ret;
bio_endio(comp_bio);
@@ -743,8 +752,11 @@ out:
static struct {
struct list_head idle_ws;
spinlock_t ws_lock;
- int num_ws;
- atomic_t alloc_ws;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
wait_queue_head_t ws_wait;
} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
@@ -758,16 +770,34 @@ void __init btrfs_init_compress(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+ struct list_head *workspace;
+
INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
spin_lock_init(&btrfs_comp_ws[i].ws_lock);
- atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+ atomic_set(&btrfs_comp_ws[i].total_ws, 0);
init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+
+ /*
+ * Preallocate one workspace for each compression type so
+ * we can guarantee forward progress in the worst case
+ */
+ workspace = btrfs_compress_op[i]->alloc_workspace();
+ if (IS_ERR(workspace)) {
+ printk(KERN_WARNING
+ "BTRFS: cannot preallocate compression workspace, will try later");
+ } else {
+ atomic_set(&btrfs_comp_ws[i].total_ws, 1);
+ btrfs_comp_ws[i].free_ws = 1;
+ list_add(workspace, &btrfs_comp_ws[i].idle_ws);
+ }
}
}
/*
- * this finds an available workspace or allocates a new one
- * ERR_PTR is returned if things go bad.
+ * This finds an available workspace or allocates a new one.
+ * If it's not possible to allocate a new one, waits until there's one.
+ * Preallocation guarantees forward progress, so we do not return
+ * errors.
*/
static struct list_head *find_workspace(int type)
{
@@ -777,36 +807,58 @@ static struct list_head *find_workspace(int type)
struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
- atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
- int *num_ws = &btrfs_comp_ws[idx].num_ws;
+ int *free_ws = &btrfs_comp_ws[idx].free_ws;
again:
spin_lock(ws_lock);
if (!list_empty(idle_ws)) {
workspace = idle_ws->next;
list_del(workspace);
- (*num_ws)--;
+ (*free_ws)--;
spin_unlock(ws_lock);
return workspace;
}
- if (atomic_read(alloc_ws) > cpus) {
+ if (atomic_read(total_ws) > cpus) {
DEFINE_WAIT(wait);
spin_unlock(ws_lock);
prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(alloc_ws) > cpus && !*num_ws)
+ if (atomic_read(total_ws) > cpus && !*free_ws)
schedule();
finish_wait(ws_wait, &wait);
goto again;
}
- atomic_inc(alloc_ws);
+ atomic_inc(total_ws);
spin_unlock(ws_lock);
workspace = btrfs_compress_op[idx]->alloc_workspace();
if (IS_ERR(workspace)) {
- atomic_dec(alloc_ws);
+ atomic_dec(total_ws);
wake_up(ws_wait);
+
+ /*
+ * Do not return the error but go back to waiting. There's a
+ * workspace preallocated for each type and the compression
+ * time is bounded so we get to a workspace eventually. This
+ * makes our caller's life easier.
+ *
+ * To prevent silent and low-probability deadlocks (when the
+ * initial preallocation fails), check if there are any
+ * workspaces at all.
+ */
+ if (atomic_read(total_ws) == 0) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ /* once per minute */ 60 * HZ,
+ /* no burst */ 1);
+
+ if (__ratelimit(&_rs)) {
+ printk(KERN_WARNING
+ "no compression workspaces, low memory, retrying");
+ }
+ }
+ goto again;
}
return workspace;
}
@@ -820,21 +872,21 @@ static void free_workspace(int type, struct list_head *workspace)
int idx = type - 1;
struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
- atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
- int *num_ws = &btrfs_comp_ws[idx].num_ws;
+ int *free_ws = &btrfs_comp_ws[idx].free_ws;
spin_lock(ws_lock);
- if (*num_ws < num_online_cpus()) {
+ if (*free_ws < num_online_cpus()) {
list_add(workspace, idle_ws);
- (*num_ws)++;
+ (*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
btrfs_compress_op[idx]->free_workspace(workspace);
- atomic_dec(alloc_ws);
+ atomic_dec(total_ws);
wake:
/*
* Make sure counter is updated before we wake up waiters.
@@ -857,7 +909,7 @@ static void free_workspaces(void)
workspace = btrfs_comp_ws[i].idle_ws.next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&btrfs_comp_ws[i].alloc_ws);
+ atomic_dec(&btrfs_comp_ws[i].total_ws);
}
}
}
@@ -894,8 +946,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
int ret;
workspace = find_workspace(type);
- if (IS_ERR(workspace))
- return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, len, pages,
@@ -930,8 +980,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
int ret;
workspace = find_workspace(type);
- if (IS_ERR(workspace))
- return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
disk_start,
@@ -952,8 +1000,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
int ret;
workspace = find_workspace(type);
- if (IS_ERR(workspace))
- return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
dest_page, start_byte,
@@ -1013,8 +1059,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
/* copy bytes from the working buffer into the pages */
while (working_bytes > 0) {
- bytes = min(PAGE_CACHE_SIZE - *pg_offset,
- PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(PAGE_SIZE - *pg_offset,
+ PAGE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
@@ -1027,7 +1073,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
current_buf_start += bytes;
/* check if we need to pick another page */
- if (*pg_offset == PAGE_CACHE_SIZE) {
+ if (*pg_offset == PAGE_SIZE) {
(*pg_index)++;
if (*pg_index >= vcnt)
return 0;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 77592931ab4fe..d1c56c94dd5ab 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -155,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
/*
* RCU really hurts here, we could free up the root node because
- * it was cow'ed but we may not get the new root node yet so do
+ * it was COWed but we may not get the new root node yet so do
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
@@ -954,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf)
{
/*
- * Tree blocks not in refernece counted trees and tree roots
+ * Tree blocks not in reference counted trees and tree roots
* are never shared. If a block was allocated after the last
* snapshot and the block was not allocated by tree relocation,
* we know the block is not shared.
@@ -1010,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return ret;
if (refs == 0) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
return ret;
}
} else {
@@ -1152,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -1197,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = tree_mod_log_free_eb(root->fs_info, buf);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -1269,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
/*
* tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewinded (until we reach something older than
+ * previous operations will be rewound (until we reach something older than
* time_seq).
*/
static void
@@ -1344,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
}
/*
- * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
* is returned. If rewind operations happen, a fresh buffer is returned. The
* returned buffer is always read-locked. If the returned buffer is not the
* input buffer, the lock on the input buffer is released and the input buffer
@@ -1372,7 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
+ eb->len);
if (!eb_rewin) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1453,7 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
} else if (old_root) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(root->fs_info, logical);
+ eb = alloc_dummy_extent_buffer(root->fs_info, logical,
+ root->nodesize);
} else {
btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
eb = btrfs_clone_extent_buffer(eb_root);
@@ -1502,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return 0;
/* ensure we can see the force_cow */
@@ -1515,7 +1518,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
* 3) the root is not forced COW.
*
* What is forced COW:
- * when we create snapshot during commiting the transaction,
+ * when we create snapshot during committing the transaction,
* after we've finished coping src root, we must COW the shared
* block to ensure the metadata consistency.
*/
@@ -1530,7 +1533,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
/*
* cows a single block, see __btrfs_cow_block for the real work.
- * This version of it has extra checks so that a block isn't cow'd more than
+ * This version of it has extra checks so that a block isn't COWed more than
* once per transaction, as long as it hasn't been written yet
*/
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
@@ -1551,6 +1554,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid, root->fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
+ trans->dirty = true;
*cow_ret = buf;
return 0;
}
@@ -1767,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
unsigned long map_len = 0;
int err;
+ if (low > high) {
+ btrfs_err(eb->fs_info,
+ "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
+ __func__, low, high, eb->start,
+ btrfs_header_owner(eb), btrfs_header_level(eb));
+ return -EINVAL;
+ }
+
while (low < high) {
mid = (low + high) / 2;
offset = p + mid * item_size;
@@ -1782,10 +1794,12 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
if (!err) {
tmp = (struct btrfs_disk_key *)(kaddr + offset -
map_start);
- } else {
+ } else if (err == 1) {
read_extent_buffer(eb, &unaligned,
offset, sizeof(unaligned));
tmp = &unaligned;
+ } else {
+ return err;
}
} else {
@@ -1852,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size)
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
- * NULL is returned on error.
*/
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
struct extent_buffer *parent, int slot)
@@ -1860,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
int level = btrfs_header_level(parent);
struct extent_buffer *eb;
- if (slot < 0)
- return NULL;
- if (slot >= btrfs_header_nritems(parent))
- return NULL;
+ if (slot < 0 || slot >= btrfs_header_nritems(parent))
+ return ERR_PTR(-ENOENT);
BUG_ON(level == 0);
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot));
- if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
- if (!IS_ERR(eb))
- free_extent_buffer(eb);
- eb = NULL;
+ if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ eb = ERR_PTR(-EIO);
}
return eb;
@@ -1925,9 +1935,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
- if (!child) {
- ret = -EROFS;
- btrfs_std_error(root->fs_info, ret, NULL);
+ if (IS_ERR(child)) {
+ ret = PTR_ERR(child);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
goto enospc;
}
@@ -1964,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
return 0;
left = read_node_slot(root, parent, pslot - 1);
+ if (IS_ERR(left))
+ left = NULL;
+
if (left) {
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
@@ -1974,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
}
+
right = read_node_slot(root, parent, pslot + 1);
+ if (IS_ERR(right))
+ right = NULL;
+
if (right) {
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
@@ -2030,7 +2047,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
*/
if (!left) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
goto enospc;
}
wret = balance_node_right(trans, root, mid, left);
@@ -2129,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return 1;
left = read_node_slot(root, parent, pslot - 1);
+ if (IS_ERR(left))
+ left = NULL;
/* first, try to make some room in the middle buffer */
if (left) {
@@ -2179,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
free_extent_buffer(left);
}
right = read_node_slot(root, parent, pslot + 1);
+ if (IS_ERR(right))
+ right = NULL;
/*
* then try to empty the right most buffer into the middle
@@ -2509,6 +2530,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
if (!btrfs_buffer_uptodate(tmp, 0, 0))
ret = -EIO;
free_extent_buffer(tmp);
+ } else {
+ ret = PTR_ERR(tmp);
}
return ret;
}
@@ -2772,8 +2795,10 @@ again:
* then we don't want to set the path blocking,
* so we test it here
*/
- if (!should_cow_block(trans, root, b))
+ if (!should_cow_block(trans, root, b)) {
+ trans->dirty = true;
goto cow_done;
+ }
/*
* must have write locks on this node and the
@@ -2822,6 +2847,8 @@ cow_done:
}
ret = key_search(b, key, level, &prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
if (level != 0) {
int dec = 0;
@@ -2985,7 +3012,7 @@ again:
btrfs_unlock_up_safe(p, level + 1);
/*
- * Since we can unwind eb's we want to do a real search every
+ * Since we can unwind ebs we want to do a real search every
* time.
*/
prev_cmp = -1;
@@ -3228,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
push_items);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(dst, src,
@@ -3303,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
src_nritems - push_items, push_items);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(dst, src,
@@ -3507,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
mid, c_nritems - mid);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(split, c,
@@ -3761,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
right = read_node_slot(root, upper, slot + 1);
- if (right == NULL)
+ /*
+ * slot + 1 is not valid or we fail to read the right node,
+ * no big deal, just return.
+ */
+ if (IS_ERR(right))
return 1;
btrfs_tree_lock(right);
@@ -3991,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
left = read_node_slot(root, path->nodes[1], slot - 1);
- if (left == NULL)
+ /*
+ * slot - 1 is not valid or we fail to read the left node,
+ * no big deal, just return.
+ */
+ if (IS_ERR(left))
return 1;
btrfs_tree_lock(left);
@@ -5198,7 +5233,10 @@ find_next_key:
}
btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
- BUG_ON(!cur); /* -ENOMEM */
+ if (IS_ERR(cur)) {
+ ret = PTR_ERR(cur);
+ goto out;
+ }
btrfs_tree_read_lock(cur);
@@ -5217,15 +5255,21 @@ out:
return ret;
}
-static void tree_move_down(struct btrfs_root *root,
+static int tree_move_down(struct btrfs_root *root,
struct btrfs_path *path,
int *level, int root_level)
{
+ struct extent_buffer *eb;
+
BUG_ON(*level == 0);
- path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
- path->slots[*level]);
+ eb = read_node_slot(root, path->nodes[*level], path->slots[*level]);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+
+ path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
+ return 0;
}
static int tree_move_next_or_upnext(struct btrfs_root *root,
@@ -5270,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root,
if (*level == 0 || !allow_down) {
ret = tree_move_next_or_upnext(root, path, level, root_level);
} else {
- tree_move_down(root, path, level, root_level);
- ret = 0;
+ ret = tree_move_down(root, path, level, root_level);
}
if (ret >= 0) {
if (*level == 0)
@@ -5361,10 +5404,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL);
+ tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
if (!tmp_buf) {
- ret = -ENOMEM;
- goto out;
+ tmp_buf = vmalloc(left_root->nodesize);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
left_path->search_commit_root = 1;
@@ -5442,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
left_root_level,
advance_left != ADVANCE_ONLY_NEXT,
&left_key);
- if (ret < 0)
+ if (ret == -1)
left_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
advance_left = 0;
}
if (advance_right && !right_end_reached) {
@@ -5451,8 +5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_root_level,
advance_right != ADVANCE_ONLY_NEXT,
&right_key);
- if (ret < 0)
+ if (ret == -1)
right_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
advance_right = 0;
}
@@ -5565,7 +5615,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
- kfree(tmp_buf);
+ kvfree(tmp_buf);
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84a6a5b3384a7..2fe8f89091a30 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
+#include <linux/btrfs_tree.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/sizes.h>
@@ -64,98 +65,6 @@ struct btrfs_ordered_sum;
#define BTRFS_COMPAT_EXTENT_TREE_V0
-/* holds pointers to all of the tree roots */
-#define BTRFS_ROOT_TREE_OBJECTID 1ULL
-
-/* stores information about which extents are in use, and reference counts */
-#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
-
-/*
- * chunk tree stores translations from logical -> physical block numbering
- * the super block points to the chunk tree
- */
-#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
-
-/*
- * stores information about which areas of a given device are in use.
- * one per device. The tree of tree roots points to the device tree
- */
-#define BTRFS_DEV_TREE_OBJECTID 4ULL
-
-/* one per subvolume, storing files and directories */
-#define BTRFS_FS_TREE_OBJECTID 5ULL
-
-/* directory objectid inside the root tree */
-#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
-
-/* holds checksums of all the data extents */
-#define BTRFS_CSUM_TREE_OBJECTID 7ULL
-
-/* holds quota configuration and tracking */
-#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
-
-/* for storing items that use the BTRFS_UUID_KEY* types */
-#define BTRFS_UUID_TREE_OBJECTID 9ULL
-
-/* tracks free space in block groups. */
-#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
-
-/* device stats in the device tree */
-#define BTRFS_DEV_STATS_OBJECTID 0ULL
-
-/* for storing balance parameters in the root tree */
-#define BTRFS_BALANCE_OBJECTID -4ULL
-
-/* orhpan objectid for tracking unlinked/truncated files */
-#define BTRFS_ORPHAN_OBJECTID -5ULL
-
-/* does write ahead logging to speed up fsyncs */
-#define BTRFS_TREE_LOG_OBJECTID -6ULL
-#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
-
-/* for space balancing */
-#define BTRFS_TREE_RELOC_OBJECTID -8ULL
-#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
-
-/*
- * extent checksums all have this objectid
- * this allows them to share the logging tree
- * for fsyncs
- */
-#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
-
-/* For storing free space cache */
-#define BTRFS_FREE_SPACE_OBJECTID -11ULL
-
-/*
- * The inode number assigned to the special inode for storing
- * free ino cache
- */
-#define BTRFS_FREE_INO_OBJECTID -12ULL
-
-/* dummy objectid represents multiple objectids */
-#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
-
-/*
- * All files have objectids in this range.
- */
-#define BTRFS_FIRST_FREE_OBJECTID 256ULL
-#define BTRFS_LAST_FREE_OBJECTID -256ULL
-#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
-
-
-/*
- * the device items go into the chunk tree. The key is in the form
- * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
- */
-#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
-
-#define BTRFS_BTREE_INODE_OBJECTID 1
-
-#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
-
-#define BTRFS_DEV_REPLACE_DEVID 0ULL
-
/*
* the max metadata block size. This limit is somewhat artificial,
* but the memmove costs go through the roof for larger blocks.
@@ -175,31 +84,14 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_LINK_MAX 65535U
-/* 32 bytes in various csum fields */
-#define BTRFS_CSUM_SIZE 32
-
-/* csum types */
-#define BTRFS_CSUM_TYPE_CRC32 0
-
static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
-/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
#define REQ_GET_READ_MIRRORS (1 << 30)
-#define BTRFS_FT_UNKNOWN 0
-#define BTRFS_FT_REG_FILE 1
-#define BTRFS_FT_DIR 2
-#define BTRFS_FT_CHRDEV 3
-#define BTRFS_FT_BLKDEV 4
-#define BTRFS_FT_FIFO 5
-#define BTRFS_FT_SOCK 6
-#define BTRFS_FT_SYMLINK 7
-#define BTRFS_FT_XATTR 8
-#define BTRFS_FT_MAX 9
-
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
@@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 };
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
-/*
- * The key defines the order in the tree, and so it also defines (optimal)
- * block layout.
- *
- * objectid corresponds to the inode number.
- *
- * type tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with type of 1 might refer to the inode data,
- * type of 2 may point to file data in the btree and type == 3 may point to
- * extents.
- *
- * offset is the starting byte offset for this key in the stream.
- *
- * btrfs_disk_key is in disk byte order. struct btrfs_key is always
- * in cpu native order. Otherwise they are identical and their sizes
- * should be the same (ie both packed)
- */
-struct btrfs_disk_key {
- __le64 objectid;
- u8 type;
- __le64 offset;
-} __attribute__ ((__packed__));
-
-struct btrfs_key {
- u64 objectid;
- u8 type;
- u64 offset;
-} __attribute__ ((__packed__));
-
struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
};
-struct btrfs_dev_item {
- /* the internal btrfs device id */
- __le64 devid;
-
- /* size of the device */
- __le64 total_bytes;
-
- /* bytes used */
- __le64 bytes_used;
-
- /* optimal io alignment for this device */
- __le32 io_align;
-
- /* optimal io width for this device */
- __le32 io_width;
-
- /* minimal io size for this device */
- __le32 sector_size;
-
- /* type and info about this device */
- __le64 type;
-
- /* expected generation for this device */
- __le64 generation;
-
- /*
- * starting byte of this partition on the device,
- * to allow for stripe alignment in the future
- */
- __le64 start_offset;
-
- /* grouping information for allocation decisions */
- __le32 dev_group;
-
- /* seek speed 0-100 where 100 is fastest */
- u8 seek_speed;
-
- /* bandwidth 0-100 where 100 is fastest */
- u8 bandwidth;
-
- /* btrfs generated uuid for this device */
- u8 uuid[BTRFS_UUID_SIZE];
-
- /* uuid of FS who owns this device */
- u8 fsid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_stripe {
- __le64 devid;
- __le64 offset;
- u8 dev_uuid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_chunk {
- /* size of this chunk in bytes */
- __le64 length;
-
- /* objectid of the root referencing this chunk */
- __le64 owner;
-
- __le64 stripe_len;
- __le64 type;
-
- /* optimal io alignment for this chunk */
- __le32 io_align;
-
- /* optimal io width for this chunk */
- __le32 io_width;
-
- /* minimal io size for this chunk */
- __le32 sector_size;
-
- /* 2^16 stripes is quite a lot, a second limit is the size of a single
- * item in the btree
- */
- __le16 num_stripes;
-
- /* sub stripes only matter for raid10 */
- __le16 sub_stripes;
- struct btrfs_stripe stripe;
- /* additional stripes go here */
-} __attribute__ ((__packed__));
-
-#define BTRFS_FREE_SPACE_EXTENT 1
-#define BTRFS_FREE_SPACE_BITMAP 2
-
-struct btrfs_free_space_entry {
- __le64 offset;
- __le64 bytes;
- u8 type;
-} __attribute__ ((__packed__));
-
-struct btrfs_free_space_header {
- struct btrfs_disk_key location;
- __le64 generation;
- __le64 num_entries;
- __le64 num_bitmaps;
-} __attribute__ ((__packed__));
-
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
sizeof(struct btrfs_stripe) * (num_stripes - 1);
}
-#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
-#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
-
/*
* File system states
*/
@@ -356,13 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_REMOUNTING 1
#define BTRFS_FS_STATE_TRANS_ABORTED 2
#define BTRFS_FS_STATE_DEV_REPLACING 3
-
-/* Super block flags */
-/* Errors detected */
-#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
-
-#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
-#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
+#define BTRFS_FS_STATE_DUMMY_FS_INFO 4
#define BTRFS_BACKREF_REV_MAX 256
#define BTRFS_BACKREF_REV_SHIFT 56
@@ -390,27 +145,11 @@ struct btrfs_header {
u8 level;
} __attribute__ ((__packed__));
-#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
- sizeof(struct btrfs_header)) / \
- sizeof(struct btrfs_key_ptr))
-#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
-#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
- (offsetof(struct btrfs_file_extent_item, disk_bytenr))
-#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
- sizeof(struct btrfs_item) - \
- BTRFS_FILE_EXTENT_INLINE_DATA_START)
-#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
- sizeof(struct btrfs_item) -\
- sizeof(struct btrfs_dir_item))
-
-
/*
* this is a very generous portion of the super block, giving us
* room to translate 14 chunks with 3 stripes each.
*/
#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
-#define BTRFS_LABEL_SIZE 256
/*
* just in case we somehow lose the roots and are not able to mount,
@@ -507,31 +246,6 @@ struct btrfs_super_block {
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
-#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
-
-#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
-#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
-/*
- * some patches floated around with a second compression method
- * lets save that incompat here for when they do get in
- * Note we don't actually support it, we're just reserving the
- * number
- */
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
-
-/*
- * older kernels tried to do bigger metadata blocks, but the
- * code was pretty buggy. Lets not let them try anymore.
- */
-#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
-
-#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
-#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
-#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
-#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
-
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
@@ -624,357 +338,8 @@ struct btrfs_path {
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
};
-
-/*
- * items in the extent btree are used to record the objectid of the
- * owner of the block and the number of references
- */
-
-struct btrfs_extent_item {
- __le64 refs;
- __le64 generation;
- __le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_item_v0 {
- __le32 refs;
-} __attribute__ ((__packed__));
-
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
sizeof(struct btrfs_item))
-
-#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
-#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
-
-/* following flags only apply to tree blocks */
-
-/* use full backrefs for extent pointers in the block */
-#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
-
-/*
- * this flag is only used internally by scrub and may be changed at any time
- * it is only declared here to avoid collisions
- */
-#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
-
-struct btrfs_tree_block_info {
- struct btrfs_disk_key key;
- u8 level;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_data_ref {
- __le64 root;
- __le64 objectid;
- __le64 offset;
- __le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_shared_data_ref {
- __le32 count;
-} __attribute__ ((__packed__));
-
-struct btrfs_extent_inline_ref {
- u8 type;
- __le64 offset;
-} __attribute__ ((__packed__));
-
-/* old style backrefs item */
-struct btrfs_extent_ref_v0 {
- __le64 root;
- __le64 generation;
- __le64 objectid;
- __le32 count;
-} __attribute__ ((__packed__));
-
-
-/* dev extents record free space on individual devices. The owner
- * field points back to the chunk allocation mapping tree that allocated
- * the extent. The chunk tree uuid field is a way to double check the owner
- */
-struct btrfs_dev_extent {
- __le64 chunk_tree;
- __le64 chunk_objectid;
- __le64 chunk_offset;
- __le64 length;
- u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_ref {
- __le64 index;
- __le16 name_len;
- /* name goes here */
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_extref {
- __le64 parent_objectid;
- __le64 index;
- __le16 name_len;
- __u8 name[0];
- /* name goes here */
-} __attribute__ ((__packed__));
-
-struct btrfs_timespec {
- __le64 sec;
- __le32 nsec;
-} __attribute__ ((__packed__));
-
-struct btrfs_inode_item {
- /* nfs style generation number */
- __le64 generation;
- /* transid that last touched this inode */
- __le64 transid;
- __le64 size;
- __le64 nbytes;
- __le64 block_group;
- __le32 nlink;
- __le32 uid;
- __le32 gid;
- __le32 mode;
- __le64 rdev;
- __le64 flags;
-
- /* modification sequence number for NFS */
- __le64 sequence;
-
- /*
- * a little future expansion, for more than this we can
- * just grow the inode item and version it
- */
- __le64 reserved[4];
- struct btrfs_timespec atime;
- struct btrfs_timespec ctime;
- struct btrfs_timespec mtime;
- struct btrfs_timespec otime;
-} __attribute__ ((__packed__));
-
-struct btrfs_dir_log_item {
- __le64 end;
-} __attribute__ ((__packed__));
-
-struct btrfs_dir_item {
- struct btrfs_disk_key location;
- __le64 transid;
- __le16 data_len;
- __le16 name_len;
- u8 type;
-} __attribute__ ((__packed__));
-
-#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
-
-/*
- * Internal in-memory flag that a subvolume has been marked for deletion but
- * still visible as a directory
- */
-#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
-
-struct btrfs_root_item {
- struct btrfs_inode_item inode;
- __le64 generation;
- __le64 root_dirid;
- __le64 bytenr;
- __le64 byte_limit;
- __le64 bytes_used;
- __le64 last_snapshot;
- __le64 flags;
- __le32 refs;
- struct btrfs_disk_key drop_progress;
- u8 drop_level;
- u8 level;
-
- /*
- * The following fields appear after subvol_uuids+subvol_times
- * were introduced.
- */
-
- /*
- * This generation number is used to test if the new fields are valid
- * and up to date while reading the root item. Every time the root item
- * is written out, the "generation" field is copied into this field. If
- * anyone ever mounted the fs with an older kernel, we will have
- * mismatching generation values here and thus must invalidate the
- * new fields. See btrfs_update_root and btrfs_find_last_root for
- * details.
- * the offset of generation_v2 is also used as the start for the memset
- * when invalidating the fields.
- */
- __le64 generation_v2;
- u8 uuid[BTRFS_UUID_SIZE];
- u8 parent_uuid[BTRFS_UUID_SIZE];
- u8 received_uuid[BTRFS_UUID_SIZE];
- __le64 ctransid; /* updated when an inode changes */
- __le64 otransid; /* trans when created */
- __le64 stransid; /* trans when sent. non-zero for received subvol */
- __le64 rtransid; /* trans when received. non-zero for received subvol */
- struct btrfs_timespec ctime;
- struct btrfs_timespec otime;
- struct btrfs_timespec stime;
- struct btrfs_timespec rtime;
- __le64 reserved[8]; /* for future */
-} __attribute__ ((__packed__));
-
-/*
- * this is used for both forward and backward root refs
- */
-struct btrfs_root_ref {
- __le64 dirid;
- __le64 sequence;
- __le16 name_len;
-} __attribute__ ((__packed__));
-
-struct btrfs_disk_balance_args {
- /*
- * profiles to operate on, single is denoted by
- * BTRFS_AVAIL_ALLOC_BIT_SINGLE
- */
- __le64 profiles;
-
- /*
- * usage filter
- * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
- * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
- */
- union {
- __le64 usage;
- struct {
- __le32 usage_min;
- __le32 usage_max;
- };
- };
-
- /* devid filter */
- __le64 devid;
-
- /* devid subset filter [pstart..pend) */
- __le64 pstart;
- __le64 pend;
-
- /* btrfs virtual address space subset filter [vstart..vend) */
- __le64 vstart;
- __le64 vend;
-
- /*
- * profile to convert to, single is denoted by
- * BTRFS_AVAIL_ALLOC_BIT_SINGLE
- */
- __le64 target;
-
- /* BTRFS_BALANCE_ARGS_* */
- __le64 flags;
-
- /*
- * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
- * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
- * and maximum
- */
- union {
- __le64 limit;
- struct {
- __le32 limit_min;
- __le32 limit_max;
- };
- };
-
- /*
- * Process chunks that cross stripes_min..stripes_max devices,
- * BTRFS_BALANCE_ARGS_STRIPES_RANGE
- */
- __le32 stripes_min;
- __le32 stripes_max;
-
- __le64 unused[6];
-} __attribute__ ((__packed__));
-
-/*
- * store balance parameters to disk so that balance can be properly
- * resumed after crash or unmount
- */
-struct btrfs_balance_item {
- /* BTRFS_BALANCE_* */
- __le64 flags;
-
- struct btrfs_disk_balance_args data;
- struct btrfs_disk_balance_args meta;
- struct btrfs_disk_balance_args sys;
-
- __le64 unused[4];
-} __attribute__ ((__packed__));
-
-#define BTRFS_FILE_EXTENT_INLINE 0
-#define BTRFS_FILE_EXTENT_REG 1
-#define BTRFS_FILE_EXTENT_PREALLOC 2
-
-struct btrfs_file_extent_item {
- /*
- * transaction id that created this extent
- */
- __le64 generation;
- /*
- * max number of bytes to hold this extent in ram
- * when we split a compressed extent we can't know how big
- * each of the resulting pieces will be. So, this is
- * an upper limit on the size of the extent in ram instead of
- * an exact limit.
- */
- __le64 ram_bytes;
-
- /*
- * 32 bits for the various ways we might encode the data,
- * including compression and encryption. If any of these
- * are set to something a given disk format doesn't understand
- * it is treated like an incompat flag for reading and writing,
- * but not for stat.
- */
- u8 compression;
- u8 encryption;
- __le16 other_encoding; /* spare for later use */
-
- /* are we inline data or a real extent? */
- u8 type;
-
- /*
- * disk space consumed by the extent, checksum blocks are included
- * in these numbers
- *
- * At this offset in the structure, the inline extent data start.
- */
- __le64 disk_bytenr;
- __le64 disk_num_bytes;
- /*
- * the logical offset in file blocks (no csums)
- * this extent record is for. This allows a file extent to point
- * into the middle of an existing extent on disk, sharing it
- * between two snapshots (useful if some bytes in the middle of the
- * extent have changed
- */
- __le64 offset;
- /*
- * the logical number of file blocks (no csums included). This
- * always reflects the size uncompressed and without encoding.
- */
- __le64 num_bytes;
-
-} __attribute__ ((__packed__));
-
-struct btrfs_csum_item {
- u8 csum;
-} __attribute__ ((__packed__));
-
-struct btrfs_dev_stats_item {
- /*
- * grow this item struct at the end for future enhancements and keep
- * the existing values unchanged
- */
- __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
-} __attribute__ ((__packed__));
-
-#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
-#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
-#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
-#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
-#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
-
struct btrfs_dev_replace {
u64 replace_state; /* see #define above */
u64 time_started; /* seconds since 1-Jan-1970 */
@@ -1005,175 +370,6 @@ struct btrfs_dev_replace {
struct btrfs_scrub_progress scrub_progress;
};
-struct btrfs_dev_replace_item {
- /*
- * grow this item struct at the end for future enhancements and keep
- * the existing values unchanged
- */
- __le64 src_devid;
- __le64 cursor_left;
- __le64 cursor_right;
- __le64 cont_reading_from_srcdev_mode;
-
- __le64 replace_state;
- __le64 time_started;
- __le64 time_stopped;
- __le64 num_write_errors;
- __le64 num_uncorrectable_read_errors;
-} __attribute__ ((__packed__));
-
-/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
-#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
-#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
- BTRFS_SPACE_INFO_GLOBAL_RSV)
-
-enum btrfs_raid_types {
- BTRFS_RAID_RAID10,
- BTRFS_RAID_RAID1,
- BTRFS_RAID_DUP,
- BTRFS_RAID_RAID0,
- BTRFS_RAID_SINGLE,
- BTRFS_RAID_RAID5,
- BTRFS_RAID_RAID6,
- BTRFS_NR_RAID_TYPES
-};
-
-#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
- BTRFS_BLOCK_GROUP_SYSTEM | \
- BTRFS_BLOCK_GROUP_METADATA)
-
-#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
- BTRFS_BLOCK_GROUP_RAID1 | \
- BTRFS_BLOCK_GROUP_RAID5 | \
- BTRFS_BLOCK_GROUP_RAID6 | \
- BTRFS_BLOCK_GROUP_DUP | \
- BTRFS_BLOCK_GROUP_RAID10)
-#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
- BTRFS_BLOCK_GROUP_RAID6)
-
-/*
- * We need a bit for restriper to be able to tell when chunks of type
- * SINGLE are available. This "extended" profile format is used in
- * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
- * (on-disk). The corresponding on-disk bit in chunk.type is reserved
- * to avoid remappings between two formats in future.
- */
-#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
-
-/*
- * A fake block group type that is used to communicate global block reserve
- * size to userspace via the SPACE_INFO ioctl.
- */
-#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
-
-#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)
-
-static inline u64 chunk_to_extended(u64 flags)
-{
- if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
- flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-
- return flags;
-}
-static inline u64 extended_to_chunk(u64 flags)
-{
- return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-}
-
-struct btrfs_block_group_item {
- __le64 used;
- __le64 chunk_objectid;
- __le64 flags;
-} __attribute__ ((__packed__));
-
-struct btrfs_free_space_info {
- __le32 extent_count;
- __le32 flags;
-} __attribute__ ((__packed__));
-
-#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
-
-#define BTRFS_QGROUP_LEVEL_SHIFT 48
-static inline u64 btrfs_qgroup_level(u64 qgroupid)
-{
- return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
-}
-
-/*
- * is subvolume quota turned on?
- */
-#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
-/*
- * RESCAN is set during the initialization phase
- */
-#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1)
-/*
- * Some qgroup entries are known to be out of date,
- * either because the configuration has changed in a way that
- * makes a rescan necessary, or because the fs has been mounted
- * with a non-qgroup-aware version.
- * Turning qouta off and on again makes it inconsistent, too.
- */
-#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
-
-#define BTRFS_QGROUP_STATUS_VERSION 1
-
-struct btrfs_qgroup_status_item {
- __le64 version;
- /*
- * the generation is updated during every commit. As older
- * versions of btrfs are not aware of qgroups, it will be
- * possible to detect inconsistencies by checking the
- * generation on mount time
- */
- __le64 generation;
-
- /* flag definitions see above */
- __le64 flags;
-
- /*
- * only used during scanning to record the progress
- * of the scan. It contains a logical address
- */
- __le64 rescan;
-} __attribute__ ((__packed__));
-
-struct btrfs_qgroup_info_item {
- __le64 generation;
- __le64 rfer;
- __le64 rfer_cmpr;
- __le64 excl;
- __le64 excl_cmpr;
-} __attribute__ ((__packed__));
-
-/* flags definition for qgroup limits */
-#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
-#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
-#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
-#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
-#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
-#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
-
-struct btrfs_qgroup_limit_item {
- /*
- * only updated when any of the other values change
- */
- __le64 flags;
- __le64 max_rfer;
- __le64 max_excl;
- __le64 rsv_rfer;
- __le64 rsv_excl;
-} __attribute__ ((__packed__));
-
/* For raid type sysfs entries */
struct raid_kobject {
int raid_type;
@@ -1221,7 +417,7 @@ struct btrfs_space_info {
* bytes_pinned does not reflect the bytes that will be pinned once the
* delayed refs are flushed, so this counter is inc'ed every time we
* call btrfs_free_extent so it is a realtime count of what will be
- * freed once the transaction is committed. It will be zero'ed every
+ * freed once the transaction is committed. It will be zeroed every
* time the transaction commits.
*/
struct percpu_counter total_bytes_pinned;
@@ -1229,6 +425,8 @@ struct btrfs_space_info {
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
+ struct list_head priority_tickets;
+ struct list_head tickets;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -1408,6 +606,27 @@ struct btrfs_block_group_cache {
struct btrfs_io_ctl io_ctl;
+ /*
+ * Incremented when doing extent allocations and holding a read lock
+ * on the space_info's groups_sem semaphore.
+ * Decremented when an ordered extent that represents an IO against this
+ * block group's range is created (after it's added to its inode's
+ * root's list of ordered extents) or immediately after the allocation
+ * if it's a metadata extent or fallocate extent (for these cases we
+ * don't create ordered extents).
+ */
+ atomic_t reservations;
+
+ /*
+ * Incremented while holding the spinlock *lock* by a task checking if
+ * it can perform a nocow write (incremented if the value for the *ro*
+ * field is 0). Decremented by such tasks once they create an ordered
+ * extent or before that if some error happens before reaching that step.
+ * This is to prevent races between block group relocation and nocow
+ * writes through direct IO.
+ */
+ atomic_t nocow_writers;
+
/* Lock for free space tree operations. */
struct mutex free_space_lock;
@@ -1881,12 +1100,11 @@ struct btrfs_subvolume_writers {
#define BTRFS_ROOT_REF_COWS 1
#define BTRFS_ROOT_TRACK_DIRTY 2
#define BTRFS_ROOT_IN_RADIX 3
-#define BTRFS_ROOT_DUMMY_ROOT 4
-#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5
-#define BTRFS_ROOT_DEFRAG_RUNNING 6
-#define BTRFS_ROOT_FORCE_COW 7
-#define BTRFS_ROOT_MULTI_LOG_TASKS 8
-#define BTRFS_ROOT_DIRTY 9
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4
+#define BTRFS_ROOT_DEFRAG_RUNNING 5
+#define BTRFS_ROOT_FORCE_COW 6
+#define BTRFS_ROOT_MULTI_LOG_TASKS 7
+#define BTRFS_ROOT_DIRTY 8
/*
* in ram representation of the tree. extent_root is used for all allocations
@@ -1948,8 +1166,10 @@ struct btrfs_root {
u64 highest_objectid;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
u64 alloc_bytenr;
+#endif
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -2026,227 +1246,38 @@ struct btrfs_root {
atomic_t qgroup_meta_rsv;
};
-struct btrfs_ioctl_defrag_range_args {
- /* start of the defrag operation */
- __u64 start;
-
- /* number of bytes to defrag, use (u64)-1 to say all */
- __u64 len;
-
- /*
- * flags for the operation, which can include turning
- * on compression for this one defrag
- */
- __u64 flags;
-
- /*
- * any extent bigger than this will be considered
- * already defragged. Use 0 to take the kernel default
- * Use 1 to say every single extent must be rewritten
- */
- __u32 extent_thresh;
-
- /*
- * which compression method to use if turning on compression
- * for this defrag operation. If unspecified, zlib will
- * be used
- */
- __u32 compress_type;
-
- /* spare for later */
- __u32 unused[4];
-};
-
-
-/*
- * inode items have the data typically returned from stat and store other
- * info about object characteristics. There is one for every file and dir in
- * the FS
- */
-#define BTRFS_INODE_ITEM_KEY 1
-#define BTRFS_INODE_REF_KEY 12
-#define BTRFS_INODE_EXTREF_KEY 13
-#define BTRFS_XATTR_ITEM_KEY 24
-#define BTRFS_ORPHAN_ITEM_KEY 48
-/* reserve 2-15 close to the inode for later flexibility */
-
-/*
- * dir items are the name -> inode pointers in a directory. There is one
- * for every name in a directory.
- */
-#define BTRFS_DIR_LOG_ITEM_KEY 60
-#define BTRFS_DIR_LOG_INDEX_KEY 72
-#define BTRFS_DIR_ITEM_KEY 84
-#define BTRFS_DIR_INDEX_KEY 96
-/*
- * extent data is for file data
- */
-#define BTRFS_EXTENT_DATA_KEY 108
-
-/*
- * extent csums are stored in a separate tree and hold csums for
- * an entire extent on disk.
- */
-#define BTRFS_EXTENT_CSUM_KEY 128
-
-/*
- * root items point to tree roots. They are typically in the root
- * tree used by the super block to find all the other trees
- */
-#define BTRFS_ROOT_ITEM_KEY 132
-
-/*
- * root backrefs tie subvols and snapshots to the directory entries that
- * reference them
- */
-#define BTRFS_ROOT_BACKREF_KEY 144
-
-/*
- * root refs make a fast index for listing all of the snapshots and
- * subvolumes referenced by a given root. They point directly to the
- * directory item in the root that references the subvol
- */
-#define BTRFS_ROOT_REF_KEY 156
-
-/*
- * extent items are in the extent map tree. These record which blocks
- * are used, and how many references there are to each block
- */
-#define BTRFS_EXTENT_ITEM_KEY 168
-
-/*
- * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
- * the length, so we save the level in key->offset instead of the length.
- */
-#define BTRFS_METADATA_ITEM_KEY 169
-
-#define BTRFS_TREE_BLOCK_REF_KEY 176
-
-#define BTRFS_EXTENT_DATA_REF_KEY 178
-
-#define BTRFS_EXTENT_REF_V0_KEY 180
-
-#define BTRFS_SHARED_BLOCK_REF_KEY 182
-
-#define BTRFS_SHARED_DATA_REF_KEY 184
-
-/*
- * block groups give us hints into the extent allocation trees. Which
- * blocks are free etc etc
- */
-#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
-
-/*
- * Every block group is represented in the free space tree by a free space info
- * item, which stores some accounting information. It is keyed on
- * (block_group_start, FREE_SPACE_INFO, block_group_length).
- */
-#define BTRFS_FREE_SPACE_INFO_KEY 198
-
-/*
- * A free space extent tracks an extent of space that is free in a block group.
- * It is keyed on (start, FREE_SPACE_EXTENT, length).
- */
-#define BTRFS_FREE_SPACE_EXTENT_KEY 199
-
-/*
- * When a block group becomes very fragmented, we convert it to use bitmaps
- * instead of extents. A free space bitmap is keyed on
- * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
- * (length / sectorsize) bits.
- */
-#define BTRFS_FREE_SPACE_BITMAP_KEY 200
-
-#define BTRFS_DEV_EXTENT_KEY 204
-#define BTRFS_DEV_ITEM_KEY 216
-#define BTRFS_CHUNK_ITEM_KEY 228
-
-/*
- * Records the overall state of the qgroups.
- * There's only one instance of this key present,
- * (0, BTRFS_QGROUP_STATUS_KEY, 0)
- */
-#define BTRFS_QGROUP_STATUS_KEY 240
-/*
- * Records the currently used space of the qgroup.
- * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
- */
-#define BTRFS_QGROUP_INFO_KEY 242
-/*
- * Contains the user configured limits for the qgroup.
- * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
- */
-#define BTRFS_QGROUP_LIMIT_KEY 244
-/*
- * Records the child-parent relationship of qgroups. For
- * each relation, 2 keys are present:
- * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
- * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
- */
-#define BTRFS_QGROUP_RELATION_KEY 246
-
-/*
- * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
- */
-#define BTRFS_BALANCE_ITEM_KEY 248
-
-/*
- * The key type for tree items that are stored persistently, but do not need to
- * exist for extended period of time. The items can exist in any tree.
- *
- * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
- *
- * Existing items:
- *
- * - balance status item
- * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
- */
-#define BTRFS_TEMPORARY_ITEM_KEY 248
+/* Bytes left in a tree block of @blocksize bytes once sizeof(struct btrfs_header) is taken off. */
+static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
+{
+	const u32 header_bytes = sizeof(struct btrfs_header);
+
+	return blocksize - header_bytes;
+}
-/*
- * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
- */
-#define BTRFS_DEV_STATS_KEY 249
+/* __BTRFS_LEAF_DATA_SIZE() applied to the nodesize recorded in @root. */
+static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root)
+{
+	const u32 nodesize = root->nodesize;
+
+	return __BTRFS_LEAF_DATA_SIZE(nodesize);
+}
-/*
- * The key type for tree items that are stored persistently and usually exist
- * for a long period, eg. filesystem lifetime. The item kinds can be status
- * information, stats or preference values. The item can exist in any tree.
- *
- * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
- *
- * Existing items:
- *
- * - device statistics, store IO stats in the device tree, one key for all
- * stats
- * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
- */
-#define BTRFS_PERSISTENT_ITEM_KEY 249
+/* Leaf data size of @root minus sizeof(struct btrfs_item). */
+static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root)
+{
+	const u32 leaf_bytes = BTRFS_LEAF_DATA_SIZE(root);
+
+	return leaf_bytes - sizeof(struct btrfs_item);
+}
-/*
- * Persistantly stores the device replace state in the device tree.
- * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
- */
-#define BTRFS_DEV_REPLACE_KEY 250
+/* How many struct btrfs_key_ptr entries fit into the leaf data size of @root. */
+static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root)
+{
+	const u32 usable_bytes = BTRFS_LEAF_DATA_SIZE(root);
+
+	return usable_bytes / sizeof(struct btrfs_key_ptr);
+}
-/*
- * Stores items that allow to quickly map UUIDs to something else.
- * These items are part of the filesystem UUID tree.
- * The key is built like this:
- * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
- */
-#if BTRFS_UUID_SIZE != 16
-#error "UUID items require BTRFS_UUID_SIZE == 16!"
-#endif
-#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */
-#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to
- * received subvols */
+/* Byte offset of the disk_bytenr member inside struct btrfs_file_extent_item. */
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
+/* BTRFS_MAX_ITEM_SIZE(@root) minus BTRFS_FILE_EXTENT_INLINE_DATA_START. */
+static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root)
+{
+	const u32 max_item = BTRFS_MAX_ITEM_SIZE(root);
+
+	return max_item - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
-/*
- * string items are for debugging. They just store a short string of
- * data in the FS
- */
-#define BTRFS_STRING_ITEM_KEY 253
+/* BTRFS_MAX_ITEM_SIZE(@root) minus sizeof(struct btrfs_dir_item). */
+static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root)
+{
+	const u32 max_item = BTRFS_MAX_ITEM_SIZE(root);
+
+	return max_item - sizeof(struct btrfs_dir_item);
+}
/*
* Flags for mount options.
@@ -2288,21 +1319,21 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
-#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
+#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-#define btrfs_set_and_info(root, opt, fmt, args...) \
-{ \
- if (!btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_set_opt(root->fs_info->mount_opt, opt); \
-}
+/*
+ * Set mount option @opt in @fs_info->mount_opt. If the option was not set
+ * before, @fmt (with @args) is logged through btrfs_info() first.
+ *
+ * Wrapped in do { } while (0) so the macro behaves as a single statement,
+ * e.g. in an unbraced if/else. @fs_info is expanded more than once, so pass
+ * an expression without side effects.
+ */
+#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
+do { \
+	if (!btrfs_test_opt(fs_info, opt)) \
+		btrfs_info(fs_info, fmt, ##args); \
+	btrfs_set_opt((fs_info)->mount_opt, opt); \
+} while (0)
-#define btrfs_clear_and_info(root, opt, fmt, args...) \
-{ \
- if (btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_clear_opt(root->fs_info->mount_opt, opt); \
-}
+/*
+ * Clear mount option @opt in @fs_info->mount_opt. If the option was set
+ * before, @fmt (with @args) is logged through btrfs_info() first.
+ *
+ * Wrapped in do { } while (0) so the macro behaves as a single statement,
+ * e.g. in an unbraced if/else. @fs_info is expanded more than once, so pass
+ * an expression without side effects.
+ */
+#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
+do { \
+	if (btrfs_test_opt(fs_info, opt)) \
+		btrfs_info(fs_info, fmt, ##args); \
+	btrfs_clear_opt((fs_info)->mount_opt, opt); \
+} while (0)
#ifdef CONFIG_BTRFS_DEBUG
@@ -2310,9 +1341,9 @@ static inline int
btrfs_should_fragment_free_space(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group)
{
- return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+ return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- (btrfs_test_opt(root, FRAGMENT_DATA) &&
+ (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif
@@ -2392,7 +1423,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
token->kaddr = NULL;
}
-/* some macros to generate set/get funcs for the struct fields. This
+/* some macros to generate set/get functions for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
* one for u8:
*/
@@ -3499,11 +2530,17 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_root *root,
- unsigned long count, int wait);
+ unsigned long count, u64 transid, int wait);
int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
@@ -3609,6 +2646,15 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
+/*
+ * Flush state identifiers, numbered contiguously from 1 to 6.
+ * NOTE(review): presumably walked in ascending order by the space
+ * reservation code - confirm against the users of this enum.
+ */
+enum btrfs_flush_state {
+	FLUSH_DELAYED_ITEMS_NR = 1,
+	FLUSH_DELAYED_ITEMS,	/* 2 */
+	FLUSH_DELALLOC,		/* 3 */
+	FLUSH_DELALLOC_WAIT,	/* 4 */
+	ALLOC_CHUNK,		/* 5 */
+	COMMIT_TRANS,		/* 6 */
+};
+
int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
@@ -3646,8 +2692,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
- struct btrfs_block_rsv *dst_rsv,
- u64 num_bytes);
+ struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
+ int update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
@@ -3860,9 +2906,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
-int btrfs_find_root_ref(struct btrfs_root *tree_root,
- struct btrfs_path *path,
- u64 root_id, u64 ref_id);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
@@ -4076,7 +3119,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
struct btrfs_root *parent_root,
u64 new_dirid);
-int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -4122,6 +3165,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
@@ -4326,19 +3370,58 @@ static inline void assfail(char *expr, char *file, int line)
#define ASSERT(expr) ((void)0)
#endif
-#define btrfs_assert()
__printf(5, 6)
__cold
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
const char *btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *function,
+ const char *function,
unsigned int line, int errno);
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ *
+ * @trans: transaction handle; the BTRFS_FS_STATE_TRANS_ABORTED bit lives in
+ *         trans->fs_info->fs_state.
+ * @errno: error code, printed in the warning and forwarded.
+ *
+ * Only the call that flips BTRFS_FS_STATE_TRANS_ABORTED from clear to set
+ * emits the WARN; every call forwards the caller's __func__ and __LINE__ to
+ * __btrfs_abort_transaction(). Both arguments are expanded more than once,
+ * so do not pass expressions with side effects.
+ */
+#define btrfs_abort_transaction(trans, errno) \
+do { \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((trans)->fs_info->fs_state))) { \
+ WARN(1, KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } \
+ __btrfs_abort_transaction((trans), __func__, \
+ __LINE__, (errno)); \
+} while (0)
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+__printf(5, 6)
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
+} while (0)
+
+
+/* compatibility and incompatibility defines */
+
#define btrfs_set_fs_incompat(__fs_info, opt) \
__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
@@ -4455,44 +3538,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
}
-/*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
- */
-#define btrfs_abort_transaction(trans, root, errno) \
-do { \
- /* Report first abort since mount */ \
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
- &((root)->fs_info->fs_state))) { \
- WARN(1, KERN_DEBUG \
- "BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
- } \
- __btrfs_abort_transaction((trans), (root), __func__, \
- __LINE__, (errno)); \
-} while (0)
-
-#define btrfs_std_error(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_std_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
-} while (0)
-
-__printf(5, 6)
-__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
-
-/*
- * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
- * will panic(). Otherwise we BUG() here.
- */
-#define btrfs_panic(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
- BUG(); \
-} while (0)
-
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
@@ -4582,13 +3627,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
void btrfs_test_destroy_inode(struct inode *inode);
#endif
-static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
+ &fs_info->fs_state)))
return 1;
#endif
return 0;
}
-
#endif
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
new file mode 100644
index 0000000000000..83ebfe28da9e2
--- /dev/null
+++ b/fs/btrfs/dedupe.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2016 Fujitsu. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_DEDUPE__
+#define __BTRFS_DEDUPE__
+
+/* later in-band dedupe will expand this struct */
+struct btrfs_dedupe_hash;
+#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6cef0062f9292..3eeb9cd8cfa57 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void)
delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
sizeof(struct btrfs_delayed_node),
0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!delayed_node_cache)
return -ENOMEM;
@@ -134,7 +134,7 @@ again:
/* cached in the btrfs inode and can be accessed */
atomic_add(2, &node->refs);
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ ret = radix_tree_preload(GFP_NOFS);
if (ret) {
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(ret);
@@ -553,7 +553,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
if (!ret) {
trace_btrfs_space_reservation(root->fs_info, "delayed_item",
item->key.objectid,
@@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
/*
+ * If our block_rsv is the delalloc block reserve then check and see if
+ * we have our extra reservation for updating the inode. If not fall
+ * through and try to reserve space quickly.
+ *
+ * We used to try and steal from the delalloc block rsv or the global
+ * reserve, but we'd steal a full reservation, which isn't kind. We are
+ * here through delalloc which means we've likely just cowed down close
+ * to the leaf that contains the inode, so we would steal less just
+ * doing the fallback inode update, so if we do end up having to steal
+ * from the global block rsv we hopefully only steal one or two blocks
+ * worth which is less likely to hurt us.
+ */
+ if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
+ release = true;
+ else
+ src_rsv = NULL;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+
+ /*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
* still need to reserve space for this update, so try to reserve the
@@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes, 1);
}
return ret;
- } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
- spin_lock(&BTRFS_I(inode)->lock);
- if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_unlock(&BTRFS_I(inode)->lock);
- release = true;
- goto migrate;
- }
- spin_unlock(&BTRFS_I(inode)->lock);
-
- /* Ok we didn't have space pre-reserved. This shouldn't happen
- * too often but it can happen if we do delalloc to an existing
- * inode which gets dirtied because of the time update, and then
- * isn't touched again until after the transaction commits and
- * then we try to write out the data. First try to be nice and
- * reserve something strictly for us. If not be a pain and try
- * to steal from the delalloc block rsv.
- */
- ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
- BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
- goto out;
-
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!ret)
- goto out;
-
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
- btrfs_debug(root->fs_info,
- "block rsv migrate returned %d", ret);
- WARN_ON(1);
- }
- /*
- * Ok this is a problem, let's just steal from the global rsv
- * since this really shouldn't happen that often.
- */
- ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
- dst_rsv, num_bytes);
- goto out;
}
-migrate:
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-out:
/*
* Migrate only takes a reservation, it doesn't touch the size of the
* block_rsv. This is to simplify people who don't normally have things
@@ -1188,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_release_delayed_node(curr_node);
curr_node = NULL;
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
@@ -1606,15 +1588,23 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode)
return 0;
}
-void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
- struct list_head *del_list)
+bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
- return;
+ return false;
+
+ /*
+ * We can only do one readdir with delayed items at a time because of
+ * item->readdir_list.
+ */
+ inode_unlock_shared(inode);
+ inode_lock(inode);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1641,10 +1631,13 @@ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
* requeue or dequeue this delayed node.
*/
atomic_dec(&delayed_node->refs);
+
+ return true;
}
-void btrfs_put_delayed_items(struct list_head *ins_list,
- struct list_head *del_list)
+void btrfs_readdir_put_delayed_items(struct inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
{
struct btrfs_delayed_item *curr, *next;
@@ -1659,6 +1652,12 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
if (atomic_dec_and_test(&curr->refs))
kfree(curr);
}
+
+ /*
+ * The VFS is going to do up_read(), so we need to downgrade back to a
+ * read lock.
+ */
+ downgrade_write(&inode->i_rwsem);
}
int btrfs_should_delete_dir_index(struct list_head *del_list,
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 0167853c84aea..2495b3d4075f8 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -137,10 +137,12 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
/* Used for readdir() */
-void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
- struct list_head *del_list);
-void btrfs_put_delayed_items(struct list_head *ins_list,
- struct list_head *del_list);
+bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_readdir_put_delayed_items(struct inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 430b3689b112b..b6d210e7a993f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -606,7 +606,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
- qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+ qexisting = btrfs_qgroup_insert_dirty_extent(fs_info,
+ delayed_refs,
qrecord);
if (qexisting)
kfree(qrecord);
@@ -615,7 +616,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
- trace_add_delayed_ref_head(ref, head_ref, action);
+ trace_add_delayed_ref_head(fs_info, ref, head_ref, action);
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
@@ -682,7 +683,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->type = BTRFS_TREE_BLOCK_REF_KEY;
full_ref->level = level;
- trace_add_delayed_tree_ref(ref, full_ref, action);
+ trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
@@ -739,7 +740,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
full_ref->objectid = owner;
full_ref->offset = offset;
- trace_add_delayed_data_ref(ref, full_ref, action);
+ trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
@@ -940,28 +941,28 @@ int btrfs_delayed_ref_init(void)
btrfs_delayed_ref_head_cachep = kmem_cache_create(
"btrfs_delayed_ref_head",
sizeof(struct btrfs_delayed_ref_head), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_ref_head_cachep)
goto fail;
btrfs_delayed_tree_ref_cachep = kmem_cache_create(
"btrfs_delayed_tree_ref",
sizeof(struct btrfs_delayed_tree_ref), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_tree_ref_cachep)
goto fail;
btrfs_delayed_data_ref_cachep = kmem_cache_create(
"btrfs_delayed_data_ref",
sizeof(struct btrfs_delayed_data_ref), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_data_ref_cachep)
goto fail;
btrfs_delayed_extent_op_cachep = kmem_cache_create(
"btrfs_delayed_extent_op",
sizeof(struct btrfs_delayed_extent_op), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_extent_op_cachep)
goto fail;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c24b653c73430..5fca9534a2712 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root {
/*
* To make qgroup to skip given root.
- * This is for snapshot, as btrfs_qgroup_inherit() will manully
+ * This is for snapshot, as btrfs_qgroup_inherit() will manually
* modify counters for snapshot and its source, so we should skip
* the snapshot in new_root/old_roots or it will get calculated twice
*/
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a1d6652e0c477..e9bbff3c0029c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
-static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
- char *srcdev_name,
- struct btrfs_device **device);
static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
static int btrfs_dev_replace_kthread(void *data);
static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
@@ -145,7 +142,7 @@ no_valid_dev_replace_entry_found:
* missing
*/
if (!dev_replace->srcdev &&
- !btrfs_test_opt(dev_root, DEGRADED)) {
+ !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -154,7 +151,7 @@ no_valid_dev_replace_entry_found:
src_devid);
}
if (!dev_replace->tgtdev &&
- !btrfs_test_opt(dev_root, DEGRADED)) {
+ !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
dev_replace->cursor_left_last_write_of_item;
}
-int btrfs_dev_replace_start(struct btrfs_root *root,
- struct btrfs_ioctl_dev_replace_args *args)
+int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+ u64 srcdevid, char *srcdev_name, int read_src)
{
struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
- switch (args->start.cont_reading_from_srcdev_mode) {
- case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
- case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
- break;
- default:
- return -EINVAL;
- }
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
-
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
- ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
- args->start.srcdev_name,
- &src_device);
+ ret = btrfs_find_device_by_devspec(root, srcdevid,
+ srcdev_name, &src_device);
if (ret) {
mutex_unlock(&fs_info->volume_mutex);
return ret;
}
- ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name,
src_device, &tgt_device);
mutex_unlock(&fs_info->volume_mutex);
if (ret)
@@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
- dev_replace->cont_reading_from_srcdev_mode =
- args->start.cont_reading_from_srcdev_mode;
+ dev_replace->cont_reading_from_srcdev_mode = read_src;
WARN_ON(!src_device);
dev_replace->srcdev = src_device;
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
- btrfs_info_in_rcu(root->fs_info,
+ btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s started",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
@@ -394,14 +377,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
dev_replace->cursor_right = 0;
dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 1;
- args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
btrfs_dev_replace_unlock(dev_replace, 1);
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
- btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+ btrfs_err(fs_info, "kobj add dev failed %d\n", ret);
- btrfs_wait_ordered_roots(root->fs_info, -1);
+ btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -419,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
- ret = btrfs_dev_replace_finishing(root->fs_info, ret);
- /* don't warn if EINPROGRESS, someone else might be running scrub */
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
- args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- ret = 0;
+ ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
} else {
WARN_ON(ret);
}
@@ -438,8 +420,37 @@ leave:
return ret;
}
+/*
+ * ioctl entry point for starting a device replace.
+ *
+ * Validates the user-supplied arguments, hands them to
+ * btrfs_dev_replace_start() and stores its return value in args->result.
+ *
+ * Returns -EINVAL for an unknown cont_reading_from_srcdev_mode, for a missing
+ * source device specification (srcdevid == 0 and an empty srcdev_name) or for
+ * an empty tgtdev_name. Otherwise returns 0 when btrfs_dev_replace_start()
+ * returns 0, SCRUB_INPROGRESS or ALREADY_STARTED, and passes any other return
+ * value through unchanged.
+ */
+int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ int ret;
+
+ switch (args->start.cont_reading_from_srcdev_mode) {
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ args->start.tgtdev_name[0] == '\0')
+ return -EINVAL;
+
+ ret = btrfs_dev_replace_start(root, args->start.tgtdev_name,
+ args->start.srcdevid,
+ args->start.srcdev_name,
+ args->start.cont_reading_from_srcdev_mode);
+ args->result = ret;
+ /*
+ * BTRFS_IOCTL_DEV_REPLACE_RESULT_* codes are status reports delivered
+ * through args->result, not ioctl failures. Don't warn if EINPROGRESS,
+ * someone else might be running scrub; likewise a replace that is
+ * already started must not leak its positive result code as the ioctl
+ * return value.
+ */
+ if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+ ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED)
+ ret = 0;
+
+ return ret;
+}
+
/*
- * blocked until all flighting bios are finished.
+ * Blocks until all in-flight bio operations are finished.
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
@@ -493,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(root->fs_info, -1);
+ btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -558,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ASSERT(list_empty(&src_device->resized_list));
tgt_device->commit_total_bytes = src_device->commit_total_bytes;
tgt_device->commit_bytes_used = src_device->bytes_used;
- if (fs_info->sb->s_bdev == src_device->bdev)
- fs_info->sb->s_bdev = tgt_device->bdev;
- if (fs_info->fs_devices->latest_bdev == src_device->bdev)
- fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+
+ btrfs_assign_next_active_device(fs_info, src_device, tgt_device);
+
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
@@ -624,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
write_unlock(&em_tree->lock);
}
-static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
- char *srcdev_name,
- struct btrfs_device **device)
-{
- int ret;
-
- if (srcdevid) {
- ret = 0;
- *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
- NULL);
- if (!*device)
- ret = -ENOENT;
- } else {
- ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
- device);
- }
- return ret;
-}
-
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 29e3ef5f96bdd..e922b42d91df2 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
-int btrfs_dev_replace_start(struct btrfs_root *root,
+int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+ u64 srcdevid, char *srcdev_name, int read_src);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b02591b03010..87dad552e39ae 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -25,7 +25,6 @@
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
@@ -102,7 +101,7 @@ int __init btrfs_end_io_wq_init(void)
btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
sizeof(struct btrfs_end_io_wq),
0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_end_io_wq_cache)
return -ENOMEM;
@@ -125,7 +124,6 @@ struct async_submit_bio {
struct list_head list;
extent_submit_bio_hook_t *submit_bio_start;
extent_submit_bio_hook_t *submit_bio_done;
- int rw;
int mirror_num;
unsigned long bio_flags;
/*
@@ -303,7 +301,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
err = map_private_extent_buffer(buf, offset, 32,
&kaddr, &map_start, &map_len);
if (err)
- return 1;
+ return err;
cur_len = min(len, map_len - (offset - map_start));
crc = btrfs_csum_data(kaddr + offset - map_start,
crc, cur_len);
@@ -313,7 +311,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
if (csum_size > sizeof(inline_result)) {
result = kzalloc(csum_size, GFP_NOFS);
if (!result)
- return 1;
+ return -ENOMEM;
} else {
result = (char *)&inline_result;
}
@@ -334,7 +332,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
kfree(result);
- return 1;
+ return -EUCLEAN;
}
} else {
write_extent_buffer(buf, result, 0, csum_size);
@@ -385,7 +383,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
/*
* Things reading via commit roots that don't have normal protection,
* like send, can have a really old block in cache that may point at a
- * block that has been free'd and re-allocated. So don't clear uptodate
+ * block that has been freed and re-allocated. So don't clear uptodate
* if we find an eb that is under IO (dirty/writeback) because we could
* end up reading in the stale data and then writing it back out and
* making everybody very sad.
@@ -419,7 +417,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
- * is filled with zeros and is included in the checkum.
+ * is filled with zeros and is included in the checksum.
*/
crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
@@ -513,11 +511,21 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
return 0;
+
found_start = btrfs_header_bytenr(eb);
- if (WARN_ON(found_start != start || !PageUptodate(page)))
- return 0;
- csum_tree_block(fs_info, eb, 0);
- return 0;
+ /*
+ * Please do not consolidate these warnings into a single if.
+ * It is useful to know what went wrong.
+ */
+ if (WARN_ON(found_start != start))
+ return -EUCLEAN;
+ if (WARN_ON(!PageUptodate(page)))
+ return -EUCLEAN;
+
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
+ btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
+
+ return csum_tree_block(fs_info, eb, 0);
}
static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
@@ -591,7 +599,7 @@ static noinline int check_leaf(struct btrfs_root *root,
/*
* Check to make sure that we don't point outside of the leaf,
- * just incase all the items are consistent to eachother, but
+ * just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
if (btrfs_item_end_nr(leaf, slot) >
@@ -661,10 +669,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
eb, found_level);
ret = csum_tree_block(fs_info, eb, 1);
- if (ret) {
- ret = -EIO;
+ if (ret)
goto err;
- }
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
@@ -720,7 +726,7 @@ static void end_workqueue_bio(struct bio *bio)
fs_info = end_io_wq->info;
end_io_wq->error = bio->bi_error;
- if (bio->bi_rw & REQ_WRITE) {
+ if (bio_op(bio) == REQ_OP_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
wq = fs_info->endio_meta_write_workers;
func = btrfs_endio_meta_write_helper;
@@ -790,7 +796,7 @@ static void run_one_async_start(struct btrfs_work *work)
int ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+ ret = async->submit_bio_start(async->inode, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
if (ret)
@@ -823,9 +829,8 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- async->submit_bio_done(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags,
- async->bio_offset);
+ async->submit_bio_done(async->inode, async->bio, async->mirror_num,
+ async->bio_flags, async->bio_offset);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -837,7 +842,7 @@ static void run_one_async_free(struct btrfs_work *work)
}
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
- int rw, struct bio *bio, int mirror_num,
+ struct bio *bio, int mirror_num,
unsigned long bio_flags,
u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
@@ -850,7 +855,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
return -ENOMEM;
async->inode = inode;
- async->rw = rw;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
@@ -866,7 +870,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
atomic_inc(&fs_info->nr_async_submits);
- if (rw & REQ_SYNC)
+ if (bio->bi_rw & REQ_SYNC)
btrfs_set_work_high_priority(&async->work);
btrfs_queue_work(fs_info->workers, &async->work);
@@ -896,9 +900,8 @@ static int btree_csum_one_bio(struct bio *bio)
return ret;
}
-static int __btree_submit_bio_start(struct inode *inode, int rw,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags,
+static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
/*
@@ -908,7 +911,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
return btree_csum_one_bio(bio);
}
-static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
@@ -918,7 +921,7 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@ -937,14 +940,14 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
return 1;
}
-static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
int async = check_async_write(inode, bio_flags);
int ret;
- if (!(rw & REQ_WRITE)) {
+ if (bio_op(bio) != REQ_OP_WRITE) {
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
@@ -953,21 +956,19 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
bio, BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
- mirror_num, 0);
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
- mirror_num, 0);
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num, 0,
+ inode, bio, mirror_num, 0,
bio_offset,
__btree_submit_bio_start,
__btree_submit_bio_done);
@@ -1055,7 +1056,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
(unsigned long long)page_offset(page));
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
}
@@ -1091,7 +1092,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
struct inode *btree_inode = root->fs_info->btree_inode;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
+ if (IS_ERR(buf))
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
buf, 0, WAIT_NONE, btree_get_extent, 0);
@@ -1107,7 +1108,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int ret;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
+ if (IS_ERR(buf))
return 0;
set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
@@ -1139,8 +1140,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr)
{
- if (btrfs_test_is_dummy_root(root))
- return alloc_test_extent_buffer(root->fs_info, bytenr);
+ if (btrfs_is_testing(root->fs_info))
+ return alloc_test_extent_buffer(root->fs_info, bytenr,
+ root->nodesize);
return alloc_extent_buffer(root->fs_info, bytenr);
}
@@ -1164,8 +1166,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
int ret;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(buf))
+ return buf;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret) {
@@ -1225,6 +1227,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
+ bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
@@ -1279,14 +1282,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
- if (fs_info)
+ if (!dummy)
extent_io_tree_init(&root->dirty_log_pages,
fs_info->btree_inode->i_mapping);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- if (fs_info)
+ if (!dummy)
root->defrag_trans_start = fs_info->generation;
else
root->defrag_trans_start = 0;
@@ -1307,15 +1310,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
-struct btrfs_root *btrfs_alloc_dummy_root(void)
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
+ u32 sectorsize, u32 nodesize)
{
struct btrfs_root *root;
- root = btrfs_alloc_root(NULL, GFP_KERNEL);
+ if (!fs_info)
+ return ERR_PTR(-EINVAL);
+
+ root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(4096, 4096, 4096, root, NULL, 1);
- set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+ /* We don't use the stripesize in selftest, set it as sectorsize */
+ __setup_root(nodesize, sectorsize, sectorsize, root, fs_info,
+ BTRFS_ROOT_TREE_OBJECTID);
root->alloc_bytenr = 0;
return root;
@@ -1590,14 +1598,14 @@ int btrfs_init_fs_root(struct btrfs_root *root)
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto free_writers;
+ goto fail;
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
&root->highest_objectid);
if (ret) {
mutex_unlock(&root->objectid_mutex);
- goto free_root_dev;
+ goto fail;
}
ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1605,14 +1613,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
mutex_unlock(&root->objectid_mutex);
return 0;
-
-free_root_dev:
- free_anon_bdev(root->anon_dev);
-free_writers:
- btrfs_free_subvolume_writers(root->subv_writers);
fail:
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
+ /* the caller is responsible to call free_fs_root */
return ret;
}
@@ -1633,7 +1635,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ ret = radix_tree_preload(GFP_NOFS);
if (ret)
return ret;
@@ -1757,7 +1759,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
if (err)
return err;
- bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+ bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
@@ -1796,6 +1798,13 @@ static int cleaner_kthread(void *arg)
if (btrfs_need_cleaner_sleep(root))
goto sleep;
+ /*
+ * Do not do anything if we might cause open_ctree() to block
+ * before we have finished mounting the filesystem.
+ */
+ if (!root->fs_info->open)
+ goto sleep;
+
if (!mutex_trylock(&root->fs_info->cleaner_mutex))
goto sleep;
@@ -1831,7 +1840,7 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(root->fs_info);
sleep:
- if (!try_to_freeze() && !again) {
+ if (!again) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule();
@@ -1921,14 +1930,12 @@ sleep:
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
&root->fs_info->fs_state)))
btrfs_cleanup_transaction(root);
- if (!try_to_freeze()) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (!kthread_should_stop() &&
- (!btrfs_transaction_blocked(root->fs_info) ||
- cannot_commit))
- schedule_timeout(delay);
- __set_current_state(TASK_RUNNING);
- }
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop() &&
+ (!btrfs_transaction_blocked(root->fs_info) ||
+ cannot_commit))
+ schedule_timeout(delay);
+ __set_current_state(TASK_RUNNING);
} while (!kthread_should_stop());
return 0;
}
@@ -2301,17 +2308,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
- btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
- max_active, 16);
+ btrfs_alloc_workqueue(fs_info, "worker",
+ flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
- btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "delalloc",
+ flags, max_active, 2);
fs_info->flush_workers =
- btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "flush_delalloc",
+ flags, max_active, 0);
fs_info->caching_workers =
- btrfs_alloc_workqueue("cache", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
/*
* a higher idle thresh on the submit workers makes it much more
@@ -2319,41 +2328,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
* devices
*/
fs_info->submit_workers =
- btrfs_alloc_workqueue("submit", flags,
+ btrfs_alloc_workqueue(fs_info, "submit", flags,
min_t(u64, fs_devices->num_devices,
max_active), 64);
fs_info->fixup_workers =
- btrfs_alloc_workqueue("fixup", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
fs_info->endio_workers =
- btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
+ max_active, 4);
fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
+ max_active, 2);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
+ max_active, 4);
fs_info->endio_repair_workers =
- btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
fs_info->rmw_workers =
- btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
fs_info->endio_write_workers =
- btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "endio-write", flags,
+ max_active, 2);
fs_info->endio_freespace_worker =
- btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
+ max_active, 0);
fs_info->delayed_workers =
- btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
+ max_active, 0);
fs_info->readahead_workers =
- btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "readahead", flags,
+ max_active, 2);
fs_info->qgroup_rescan_workers =
- btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->extent_workers =
- btrfs_alloc_workqueue("extent-refs", flags,
+ btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
min_t(u64, fs_devices->num_devices,
max_active), 8);
@@ -2412,7 +2428,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
- btrfs_std_error(tree_root->fs_info, ret,
+ btrfs_handle_fs_error(tree_root->fs_info, ret,
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2537,7 +2553,7 @@ int open_ctree(struct super_block *sb,
err = ret;
goto fail_bdi;
}
- fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+ fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
@@ -2708,7 +2724,7 @@ int open_ctree(struct super_block *sb,
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
if (btrfs_check_super_csum(bh->b_data)) {
- printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
+ btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
brelse(bh);
goto fail_alloc;
@@ -2728,7 +2744,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
if (ret) {
- printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
+ btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
goto fail_alloc;
}
@@ -2763,9 +2779,9 @@ int open_ctree(struct super_block *sb,
features = btrfs_super_incompat_flags(disk_super) &
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- printk(KERN_ERR "BTRFS: couldn't mount because of "
- "unsupported optional features (%Lx).\n",
- features);
+ btrfs_err(fs_info,
+ "cannot mount because of unsupported optional features (%llx)",
+ features);
err = -EINVAL;
goto fail_alloc;
}
@@ -2776,21 +2792,22 @@ int open_ctree(struct super_block *sb,
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- printk(KERN_INFO "BTRFS: has skinny extents\n");
+ btrfs_info(fs_info, "has skinny extents");
/*
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
- if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
+ btrfs_info(fs_info,
+ "flagging fs with big metadata feature");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = btrfs_super_stripesize(disk_super);
+ stripesize = sectorsize;
fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
@@ -2800,9 +2817,9 @@ int open_ctree(struct super_block *sb,
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
(sectorsize != nodesize)) {
- printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
- "are not allowed for mixed block groups on %s\n",
- sb->s_id);
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ nodesize, sectorsize);
goto fail_alloc;
}
@@ -2815,8 +2832,8 @@ int open_ctree(struct super_block *sb,
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
- "unsupported option features (%Lx).\n",
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unsupported optional features (%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -2832,7 +2849,7 @@ int open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- SZ_4M / PAGE_CACHE_SIZE);
+ SZ_4M / PAGE_SIZE);
tree_root->nodesize = nodesize;
tree_root->sectorsize = sectorsize;
@@ -2845,8 +2862,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to read the system "
- "array on %s\n", sb->s_id);
+ btrfs_err(fs_info, "failed to read the system array: %d", ret);
goto fail_sb_buffer;
}
@@ -2860,8 +2876,7 @@ int open_ctree(struct super_block *sb,
generation);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
- sb->s_id);
+ btrfs_err(fs_info, "failed to read chunk root");
if (!IS_ERR(chunk_root->node))
free_extent_buffer(chunk_root->node);
chunk_root->node = NULL;
@@ -2875,8 +2890,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_chunk_tree(chunk_root);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
- sb->s_id);
+ btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
goto fail_tree_roots;
}
@@ -2887,8 +2901,7 @@ int open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_devices, 0);
if (!fs_devices->latest_bdev) {
- printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
- sb->s_id);
+ btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
@@ -2900,8 +2913,7 @@ retry_root_backup:
generation);
if (IS_ERR(tree_root->node) ||
!extent_buffer_uptodate(tree_root->node)) {
- printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
- sb->s_id);
+ btrfs_warn(fs_info, "failed to read tree root");
if (!IS_ERR(tree_root->node))
free_extent_buffer(tree_root->node);
tree_root->node = NULL;
@@ -2933,20 +2945,19 @@ retry_root_backup:
ret = btrfs_recover_balance(fs_info);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to recover balance\n");
+ btrfs_err(fs_info, "failed to recover balance: %d", ret);
goto fail_block_groups;
}
ret = btrfs_init_dev_stats(fs_info);
if (ret) {
- printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
- ret);
+ btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
goto fail_block_groups;
}
ret = btrfs_init_dev_replace(fs_info);
if (ret) {
- pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
+ btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
goto fail_block_groups;
}
@@ -2954,31 +2965,33 @@ retry_root_backup:
ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
if (ret) {
- pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+ btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
+ ret);
goto fail_block_groups;
}
ret = btrfs_sysfs_add_device(fs_devices);
if (ret) {
- pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+ btrfs_err(fs_info, "failed to init sysfs device interface: %d",
+ ret);
goto fail_fsdev_sysfs;
}
ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
- pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+ btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
goto fail_fsdev_sysfs;
}
ret = btrfs_init_space_info(fs_info);
if (ret) {
- printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+ btrfs_err(fs_info, "failed to initialize space info: %d", ret);
goto fail_sysfs;
}
ret = btrfs_read_block_groups(fs_info->extent_root);
if (ret) {
- printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+ btrfs_err(fs_info, "failed to read block groups: %d", ret);
goto fail_sysfs;
}
fs_info->num_tolerated_disk_barrier_failures =
@@ -2986,7 +2999,8 @@ retry_root_backup:
if (fs_info->fs_devices->missing_devices >
fs_info->num_tolerated_disk_barrier_failures &&
!(sb->s_flags & MS_RDONLY)) {
- pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+ btrfs_warn(fs_info,
+"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed",
fs_info->fs_devices->missing_devices,
fs_info->num_tolerated_disk_barrier_failures);
goto fail_sysfs;
@@ -3003,30 +3017,30 @@ retry_root_backup:
if (IS_ERR(fs_info->transaction_kthread))
goto fail_cleaner;
- if (!btrfs_test_opt(tree_root, SSD) &&
- !btrfs_test_opt(tree_root, NOSSD) &&
+ if (!btrfs_test_opt(tree_root->fs_info, SSD) &&
+ !btrfs_test_opt(tree_root->fs_info, NOSSD) &&
!fs_info->fs_devices->rotating) {
- printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
- "mode\n");
+ btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
btrfs_set_opt(fs_info->mount_opt, SSD);
}
/*
- * Mount does not set all options immediatelly, we can do it now and do
+ * Mount does not set all options immediately, we can do it now and do
* not have to wait for transaction commit
*/
btrfs_apply_pending_changes(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+ if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) {
ret = btrfsic_mount(tree_root, fs_devices,
- btrfs_test_opt(tree_root,
+ btrfs_test_opt(tree_root->fs_info,
CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
1 : 0,
fs_info->check_integrity_print_mask);
if (ret)
- printk(KERN_WARNING "BTRFS: failed to initialize"
- " integrity check module %s\n", sb->s_id);
+ btrfs_warn(fs_info,
+ "failed to initialize integrity check module: %d",
+ ret);
}
#endif
ret = btrfs_read_qgroup_config(fs_info);
@@ -3035,7 +3049,7 @@ retry_root_backup:
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
- !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
+ !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) {
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3056,8 +3070,8 @@ retry_root_backup:
ret = btrfs_recover_relocation(tree_root);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
- printk(KERN_WARNING
- "BTRFS: failed to recover relocation\n");
+ btrfs_warn(fs_info, "failed to recover relocation: %d",
+ ret);
err = -EINVAL;
goto fail_qgroup;
}
@@ -3076,13 +3090,13 @@ retry_root_backup:
if (sb->s_flags & MS_RDONLY)
return 0;
- if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+ if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- pr_info("BTRFS: creating free space tree\n");
+ btrfs_info(fs_info, "creating free space tree");
ret = btrfs_create_free_space_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to create free space tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to create free space tree: %d", ret);
close_ctree(tree_root);
return ret;
}
@@ -3099,49 +3113,49 @@ retry_root_backup:
ret = btrfs_resume_balance_async(fs_info);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to resume balance\n");
+ btrfs_warn(fs_info, "failed to resume balance: %d", ret);
close_ctree(tree_root);
return ret;
}
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to resume dev_replace\n");
+ btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
close_ctree(tree_root);
return ret;
}
btrfs_qgroup_rescan_resume(fs_info);
- if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+ if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- pr_info("BTRFS: clearing free space tree\n");
+ btrfs_info(fs_info, "clearing free space tree");
ret = btrfs_clear_free_space_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to clear free space tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to clear free space tree: %d", ret);
close_ctree(tree_root);
return ret;
}
}
if (!fs_info->uuid_root) {
- pr_info("BTRFS: creating UUID tree\n");
+ btrfs_info(fs_info, "creating UUID tree");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to create the UUID tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to create the UUID tree: %d", ret);
close_ctree(tree_root);
return ret;
}
- } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+ } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) ||
fs_info->generation !=
btrfs_super_uuid_tree_generation(disk_super)) {
- pr_info("BTRFS: checking UUID tree\n");
+ btrfs_info(fs_info, "checking UUID tree");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
- pr_warn("BTRFS: failed to check the UUID tree %d\n",
- ret);
+ btrfs_warn(fs_info,
+ "failed to check the UUID tree: %d", ret);
close_ctree(tree_root);
return ret;
}
@@ -3211,7 +3225,7 @@ fail:
return err;
recovery_tree_root:
- if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
+ if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT))
goto fail_tree_roots;
free_root_pointers(fs_info, 0);
@@ -3240,7 +3254,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
"lost page write due to IO error on %s",
rcu_str_deref(device->name));
- /* note, we dont' set_buffer_write_io_error because we have
+ /* note, we don't set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
clear_buffer_uptodate(bh);
@@ -3405,9 +3419,9 @@ static int write_dev_supers(struct btrfs_device *device,
* to go down lazy.
*/
if (i == 0)
- ret = btrfsic_submit_bh(WRITE_FUA, bh);
+ ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh);
else
- ret = btrfsic_submit_bh(WRITE_SYNC, bh);
+ ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
if (ret)
errors++;
}
@@ -3471,12 +3485,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
bio->bi_end_io = btrfs_end_empty_barrier;
bio->bi_bdev = device->bdev;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
device->flush_bio = bio;
bio_get(bio);
- btrfsic_submit_bio(WRITE_FLUSH, bio);
+ btrfsic_submit_bio(bio);
return 0;
}
@@ -3626,7 +3641,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
- do_barriers = !btrfs_test_opt(root, NOBARRIER);
+ do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER);
backup_super_roots(root->fs_info);
sb = root->fs_info->super_for_commit;
@@ -3641,7 +3656,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret) {
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
@@ -3681,7 +3696,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
- btrfs_std_error(root->fs_info, -EIO,
+ btrfs_handle_fs_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3699,7 +3714,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- btrfs_std_error(root->fs_info, -EIO,
+ btrfs_handle_fs_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3910,7 +3925,7 @@ void close_ctree(struct btrfs_root *root)
iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
btrfsic_unmount(root, fs_info->fs_devices);
#endif
@@ -4071,9 +4086,9 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
/* Only PAGE SIZE is supported yet */
- if (sectorsize != PAGE_CACHE_SIZE) {
+ if (sectorsize != PAGE_SIZE) {
printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n",
- sectorsize, PAGE_CACHE_SIZE);
+ sectorsize, PAGE_SIZE);
ret = -EINVAL;
}
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
@@ -4115,6 +4130,16 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
* Hint to catch really bogus numbers, bitflips or so, more exact checks are
* done later
*/
+ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+ btrfs_err(fs_info, "bytes_used is too small %llu",
+ btrfs_super_bytes_used(sb));
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(btrfs_super_stripesize(sb))) {
+ btrfs_err(fs_info, "invalid stripesize %u",
+ btrfs_super_stripesize(sb));
+ ret = -EINVAL;
+ }
if (btrfs_super_num_devices(sb) > (1UL << 31))
printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
btrfs_super_num_devices(sb));
@@ -4352,7 +4377,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
if (ret)
break;
- clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ clear_extent_bits(dirty_pages, start, end, mark);
while (start <= end) {
eb = btrfs_find_tree_block(root->fs_info, start);
start += root->nodesize;
@@ -4387,7 +4412,7 @@ again:
if (ret)
break;
- clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ clear_extent_dirty(unpin, start, end);
btrfs_error_unpin_extent_range(root, start, end);
cond_resched();
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8e79d0070bcf5..b3207a0e09f79 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -90,7 +90,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
void btrfs_free_fs_root(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct btrfs_root *btrfs_alloc_dummy_root(void);
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
+ u32 sectorsize, u32 nodesize);
#endif
/*
@@ -122,7 +123,7 @@ void btrfs_csum_final(u32 crc, char *result);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
- int rw, struct bio *bio, int mirror_num,
+ struct bio *bio, int mirror_num,
unsigned long bio_flags, u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 53e12977bfd01..61b494e8e604e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int reserved);
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush);
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes);
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -231,9 +241,9 @@ static int add_excluded_extent(struct btrfs_root *root,
{
u64 end = start + num_bytes - 1;
set_extent_bits(&root->fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
set_extent_bits(&root->fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
return 0;
}
@@ -246,9 +256,9 @@ static void free_excluded_extents(struct btrfs_root *root,
end = start + cache->key.offset - 1;
clear_extent_bits(&root->fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
clear_extent_bits(&root->fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE, GFP_NOFS);
+ start, end, EXTENT_UPTODATE);
}
static int exclude_super_stripes(struct btrfs_root *root,
@@ -980,7 +990,7 @@ out_free:
* event that tree block loses its owner tree's reference and do the
* back refs conversion.
*
- * When a tree block is COW'd through a tree, there are four cases:
+ * When a tree block is COWed through a tree, there are four cases:
*
* The reference count of the block is one and the tree is the block's
* owner tree. Nothing to do in this case.
@@ -2042,8 +2052,13 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
struct btrfs_bio *bbio = NULL;
+ /*
+ * Avoid races with device replace and make sure our bbio has devices
+ * associated to its stripes that don't go away while we are discarding.
+ */
+ btrfs_bio_counter_inc_blocked(root->fs_info);
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
+ ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
bytenr, &num_bytes, &bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
@@ -2074,6 +2089,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
}
btrfs_put_bbio(bbio);
}
+ btrfs_bio_counter_dec(root->fs_info);
if (actual_bytes)
*actual_bytes = discarded_bytes;
@@ -2164,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
return ret;
@@ -2188,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ref = btrfs_delayed_node_to_data_ref(node);
- trace_run_delayed_data_ref(node, ref, node->action);
+ trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
@@ -2343,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
- trace_run_delayed_tree_ref(node, ref, node->action);
+ trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
parent = ref->parent;
@@ -2407,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
*/
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
- trace_run_delayed_ref_head(node, head, node->action);
+ trace_run_delayed_ref_head(root->fs_info, node, head,
+ node->action);
if (insert_reserved) {
btrfs_pin_extent(root, node->bytenr,
@@ -2595,7 +2612,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
/*
- * Need to drop our head ref lock and re-aqcuire the
+ * Need to drop our head ref lock and re-acquire the
* delayed ref lock and then re-check to make sure
* nobody got added.
*/
@@ -2747,7 +2764,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
/*
* We don't ever fill up leaves all the way so multiply by 2 just to be
- * closer to what we're really going to want to ouse.
+ * closer to what we're really going to want to use.
*/
return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
}
@@ -2762,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
u64 num_csums_per_leaf;
u64 num_csums;
- csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ csum_size = BTRFS_MAX_ITEM_SIZE(root);
num_csums_per_leaf = div64_u64(csum_size,
(u64)btrfs_super_csum_size(root->fs_info->super_copy));
num_csums = div64_u64(csum_bytes, root->sectorsize);
@@ -2829,6 +2846,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
struct async_delayed_refs {
struct btrfs_root *root;
+ u64 transid;
int count;
int error;
int sync;
@@ -2844,6 +2862,10 @@ static void delayed_ref_async_start(struct btrfs_work *work)
async = container_of(work, struct async_delayed_refs, work);
+ /* if the commit is already started, we don't need to wait here */
+ if (btrfs_transaction_blocked(async->root->fs_info))
+ goto done;
+
trans = btrfs_join_transaction(async->root);
if (IS_ERR(trans)) {
async->error = PTR_ERR(trans);
@@ -2851,14 +2873,19 @@ static void delayed_ref_async_start(struct btrfs_work *work)
}
/*
- * trans->sync means that when we call end_transaciton, we won't
+ * trans->sync means that when we call end_transaction, we won't
* wait on delayed refs
*/
trans->sync = true;
+
+ /* Don't bother flushing if we got into a different transaction */
+ if (trans->transid > async->transid)
+ goto end;
+
ret = btrfs_run_delayed_refs(trans, async->root, async->count);
if (ret)
async->error = ret;
-
+end:
ret = btrfs_end_transaction(trans, async->root);
if (ret && !async->error)
async->error = ret;
@@ -2870,7 +2897,7 @@ done:
}
int btrfs_async_run_delayed_refs(struct btrfs_root *root,
- unsigned long count, int wait)
+ unsigned long count, u64 transid, int wait)
{
struct async_delayed_refs *async;
int ret;
@@ -2882,6 +2909,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
async->root = root->fs_info->tree_root;
async->count = count;
async->error = 0;
+ async->transid = transid;
if (wait)
async->sync = 1;
else
@@ -2943,7 +2971,7 @@ again:
trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, root, count);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3207,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
u64, u64, u64, u64, u64, u64);
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return 0;
ref_root = btrfs_header_owner(buf);
@@ -3402,7 +3430,7 @@ again:
* transaction, this only happens in really bad situations
* anyway.
*/
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_put;
}
WARN_ON(ret);
@@ -3420,7 +3448,7 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE)) {
+ !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3452,7 +3480,7 @@ again:
num_pages = 1;
num_pages *= 16;
- num_pages *= PAGE_CACHE_SIZE;
+ num_pages *= PAGE_SIZE;
ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
@@ -3497,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
if (list_empty(&cur_trans->dirty_bgs) ||
- !btrfs_test_opt(root, SPACE_CACHE))
+ !btrfs_test_opt(root->fs_info, SPACE_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -3642,7 +3670,7 @@ again:
}
spin_unlock(&cur_trans->dirty_bgs_lock);
} else if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
}
@@ -3788,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache);
}
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
/* if its not on the io list, we need to put the block group */
@@ -3824,6 +3852,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
return readonly;
}
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group_cache *bg;
+ bool ret = true;
+ /* Register a nocow writer on the block group containing bytenr; false if there is none or it is read-only. */
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg)
+ return false;
+ /* the ro check and the counter increment both happen under bg->lock */
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ atomic_inc(&bg->nocow_writers);
+ spin_unlock(&bg->lock);
+
+ /* on success keep the lookup reference (btrfs_dec_nocow_writers() puts it); drop it only on failure */
+ if (!ret)
+ btrfs_put_block_group(bg);
+
+ return ret;
+
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group_cache *bg;
+ /* Undo one btrfs_inc_nocow_writers() on bytenr's block group; issues a wake-up when the count reaches zero. */
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ ASSERT(bg);
+ if (atomic_dec_and_test(&bg->nocow_writers))
+ wake_up_atomic_t(&bg->nocow_writers);
+ /*
+ * Once for our lookup and once for the lookup done by a previous call
+ * to btrfs_inc_nocow_writers()
+ */
+ btrfs_put_block_group(bg);
+ btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
+{
+ schedule(); /* action callback handed to wait_on_atomic_t() by btrfs_wait_nocow_writers(); ignores its argument */
+ return 0;
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{ /* Waits in TASK_UNINTERRUPTIBLE on bg->nocow_writers (presumably until it reaches zero - confirm against wait_on_atomic_t()). */
+ wait_on_atomic_t(&bg->nocow_writers,
+ btrfs_wait_nocow_writers_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+}
+
static const char *alloc_name(u64 flags)
{
switch (flags) {
@@ -3843,6 +3924,7 @@ static const char *alloc_name(u64 flags)
static int update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
@@ -3863,8 +3945,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
+ found->bytes_readonly += bytes_readonly;
if (total_bytes > 0)
found->full = 0;
+ space_info_add_new_bytes(info, found, total_bytes -
+ bytes_used - bytes_readonly);
spin_unlock(&found->lock);
*space_info = found;
return 0;
@@ -3890,7 +3975,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_used = bytes_used * factor;
found->bytes_pinned = 0;
found->bytes_reserved = 0;
- found->bytes_readonly = 0;
+ found->bytes_readonly = bytes_readonly;
found->bytes_may_use = 0;
found->full = 0;
found->max_extent_size = 0;
@@ -3899,6 +3984,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->flush = 0;
init_waitqueue_head(&found->wait);
INIT_LIST_HEAD(&found->ro_bgs);
+ INIT_LIST_HEAD(&found->tickets);
+ INIT_LIST_HEAD(&found->priority_tickets);
ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
@@ -4141,7 +4228,7 @@ commit_trans:
if (need_commit > 0) {
btrfs_start_delalloc_roots(fs_info, 0, -1);
- btrfs_wait_ordered_roots(fs_info, -1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
}
trans = btrfs_join_transaction(root);
@@ -4243,7 +4330,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* Called if we need to clear a data reservation for this inode
* Normally in a error case.
*
- * This one will handle the per-indoe data rsv map for accurate reserved
+ * This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
@@ -4357,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
btrfs_calc_trans_metadata_size(root, 1);
- if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
dump_space_info(info, 0, 0);
@@ -4400,7 +4487,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
- 0, 0, &space_info);
+ 0, 0, 0, &space_info);
BUG_ON(ret); /* -ENOMEM */
}
BUG_ON(!space_info); /* Logic error */
@@ -4502,7 +4589,7 @@ out:
*/
if (trans->can_flush_pending_bgs &&
trans->chunk_bytes_reserved >= (u64)SZ_2M) {
- btrfs_create_pending_block_groups(trans, trans->root);
+ btrfs_create_pending_block_groups(trans, extent_root);
btrfs_trans_release_chunk_metadata(trans);
}
return ret;
@@ -4512,12 +4599,19 @@ static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
- u64 profile = btrfs_get_alloc_profile(root, 0);
+ struct btrfs_block_rsv *global_rsv;
+ u64 profile;
u64 space_size;
u64 avail;
u64 used;
+ /* Don't overcommit when in mixed mode. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ BUG_ON(root->fs_info == NULL);
+ global_rsv = &root->fs_info->global_block_rsv;
+ profile = btrfs_get_alloc_profile(root, 0);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly;
@@ -4583,7 +4677,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
*/
btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
if (!current->journal_info)
- btrfs_wait_ordered_roots(root->fs_info, nr_items);
+ btrfs_wait_ordered_roots(root->fs_info, nr_items,
+ 0, (u64)-1);
}
}
@@ -4620,7 +4715,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(root, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4632,14 +4727,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
if (trans)
return;
if (wait_ordered)
- btrfs_wait_ordered_roots(root->fs_info, items);
+ btrfs_wait_ordered_roots(root->fs_info, items,
+ 0, (u64)-1);
return;
}
loops = 0;
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
- nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+ nr_pages = max_reclaim >> PAGE_SHIFT;
btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
/*
* We need to wait for the async pages to actually start before
@@ -4667,11 +4763,17 @@ skip_async:
spin_unlock(&space_info->lock);
break;
}
+ if (list_empty(&space_info->tickets) &&
+ list_empty(&space_info->priority_tickets)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
spin_unlock(&space_info->lock);
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(root->fs_info, items);
+ btrfs_wait_ordered_roots(root->fs_info, items,
+ 0, (u64)-1);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -4734,13 +4836,11 @@ commit:
return btrfs_commit_transaction(trans, root);
}
-enum flush_state {
- FLUSH_DELAYED_ITEMS_NR = 1,
- FLUSH_DELAYED_ITEMS = 2,
- FLUSH_DELALLOC = 3,
- FLUSH_DELALLOC_WAIT = 4,
- ALLOC_CHUNK = 5,
- COMMIT_TRANS = 6,
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ struct list_head list;
+ wait_queue_head_t wait;
};
static int flush_space(struct btrfs_root *root,
@@ -4793,6 +4893,8 @@ static int flush_space(struct btrfs_root *root,
break;
}
+ trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
+ orig_bytes, state, ret);
return ret;
}
@@ -4800,17 +4902,22 @@ static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
struct btrfs_space_info *space_info)
{
+ struct reserve_ticket *ticket;
u64 used;
u64 expected;
- u64 to_reclaim;
+ u64 to_reclaim = 0;
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL)) {
- to_reclaim = 0;
- goto out;
- }
+ BTRFS_RESERVE_FLUSH_ALL))
+ return 0;
+
+ list_for_each_entry(ticket, &space_info->tickets, list)
+ to_reclaim += ticket->bytes;
+ list_for_each_entry(ticket, &space_info->priority_tickets, list)
+ to_reclaim += ticket->bytes;
+ if (to_reclaim)
+ return to_reclaim;
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
@@ -4826,14 +4933,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
to_reclaim = 0;
to_reclaim = min(to_reclaim, space_info->bytes_may_use +
space_info->bytes_reserved);
-out:
- spin_unlock(&space_info->lock);
-
return to_reclaim;
}
static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info, u64 used)
+ struct btrfs_root *root, u64 used)
{
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
@@ -4841,158 +4945,217 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+ if (!btrfs_calc_reclaim_metadata_size(root, space_info))
+ return 0;
+
+ return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &root->fs_info->fs_state));
}
-static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info,
- int flush_state)
+static void wake_all_tickets(struct list_head *head)
{
- u64 used;
-
- spin_lock(&space_info->lock);
- /*
- * We run out of space and have not got any free space via flush_space,
- * so don't bother doing async reclaim.
- */
- if (flush_state > COMMIT_TRANS && space_info->full) {
- spin_unlock(&space_info->lock);
- return 0;
- }
+ struct reserve_ticket *ticket;
- used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
- if (need_do_async_reclaim(space_info, fs_info, used)) {
- spin_unlock(&space_info->lock);
- return 1;
+ while (!list_empty(head)) {
+ ticket = list_first_entry(head, struct reserve_ticket, list);
+ list_del_init(&ticket->list);
+ ticket->error = -ENOSPC;
+ wake_up(&ticket->wait);
}
- spin_unlock(&space_info->lock);
-
- return 0;
}
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to. We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
+ struct reserve_ticket *last_ticket = NULL;
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
int flush_state;
+ int commit_cycles = 0;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
space_info);
- if (!to_reclaim)
+ if (!to_reclaim) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
return;
+ }
+ last_ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ spin_unlock(&space_info->lock);
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
+ struct reserve_ticket *ticket;
+ int ret;
+
+ ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ if (last_ticket == ticket) {
+ flush_state++;
+ } else {
+ last_ticket = ticket;
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ if (commit_cycles)
+ commit_cycles--;
+ }
+
+ if (flush_state > COMMIT_TRANS) {
+ commit_cycles++;
+ if (commit_cycles > 2) {
+ wake_all_tickets(&space_info->tickets);
+ space_info->flush = 0;
+ } else {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ } while (flush_state <= COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ u64 to_reclaim;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+
+ do {
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info,
- flush_state))
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
return;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * Priority flushers can't wait on delalloc without
+ * deadlocking.
+ */
+ if (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT)
+ flush_state = ALLOC_CHUNK;
} while (flush_state < COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket, u64 orig_bytes)
+
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ while (ticket->bytes > 0 && ticket->error == 0) {
+ ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+ if (ret) {
+ ret = -EINTR;
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ schedule();
+
+ finish_wait(&ticket->wait, &wait);
+ spin_lock(&space_info->lock);
+ }
+ if (!ret)
+ ret = ticket->error;
+ if (!list_empty(&ticket->list))
+ list_del_init(&ticket->list);
+ if (ticket->bytes && ticket->bytes < orig_bytes) {
+ u64 num_bytes = orig_bytes - ticket->bytes;
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
+ }
+ spin_unlock(&space_info->lock);
+
+ return ret;
}
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
+ * @space_info - the space info we want to allocate from
* @orig_bytes - the number of bytes we want
* @flush - whether or not we can flush to make our reservation
*
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
* flush out space to make room. It will do this by flushing delalloc if
* possible or committing the transaction. If flush is 0 then no attempts to
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int reserve_metadata_bytes(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *space_info = block_rsv->space_info;
+ struct reserve_ticket ticket;
u64 used;
- u64 num_bytes = orig_bytes;
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
int ret = 0;
- bool flushing = false;
-
-again:
- ret = 0;
- spin_lock(&space_info->lock);
- /*
- * We only want to wait if somebody other than us is flushing and we
- * are actually allowed to flush all things.
- */
- while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
- space_info->flush) {
- spin_unlock(&space_info->lock);
- /*
- * If we have a trans handle we can't wait because the flusher
- * may have to commit the transaction, which would mean we would
- * deadlock since we are waiting for the flusher to finish, but
- * hold the current transaction open.
- */
- if (current->journal_info)
- return -EAGAIN;
- ret = wait_event_killable(space_info->wait, !space_info->flush);
- /* Must have been killed, return */
- if (ret)
- return -EINTR;
- spin_lock(&space_info->lock);
- }
+ ASSERT(orig_bytes);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ spin_lock(&space_info->lock);
ret = -ENOSPC;
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
/*
- * The idea here is that we've not already over-reserved the block group
- * then we can go ahead and save our reservation first and then start
- * flushing if we need to. Otherwise if we've already overcommitted
- * lets start flushing stuff first and then come back and try to make
- * our reservation.
+ * If we have enough space then hooray, make our reservation and carry
+ * on. If not see if we can overcommit, and if we can, hooray carry on.
+ * If not things get more complicated.
*/
- if (used <= space_info->total_bytes) {
- if (used + orig_bytes <= space_info->total_bytes) {
- space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info,
- "space_info", space_info->flags, orig_bytes, 1);
- ret = 0;
- } else {
- /*
- * Ok set num_bytes to orig_bytes since we aren't
- * overocmmitted, this way we only try and reclaim what
- * we need.
- */
- num_bytes = orig_bytes;
- }
- } else {
- /*
- * Ok we're over committed, set num_bytes to the overcommitted
- * amount plus the amount of bytes that we need for this
- * reservation.
- */
- num_bytes = used - space_info->total_bytes +
- (orig_bytes * 2);
- }
-
- if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ if (used + orig_bytes <= space_info->total_bytes) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
+ } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
space_info->bytes_may_use += orig_bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
space_info->flags, orig_bytes,
@@ -5001,16 +5164,31 @@ again:
}
/*
- * Couldn't make our reservation, save our place so while we're trying
- * to reclaim space we can actually use it instead of somebody else
- * stealing it from us.
+ * If we couldn't make a reservation then set up our reservation ticket
+ * and kick the async worker if it's not already running.
*
- * We make the other tasks wait for the flush only when we can flush
- * all things.
+ * If we are a priority flusher then we just need to add our ticket to
+ * the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
- flushing = true;
- space_info->flush = 1;
+ ticket.bytes = orig_bytes;
+ ticket.error = 0;
+ init_waitqueue_head(&ticket.wait);
+ if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+ list_add_tail(&ticket.list, &space_info->tickets);
+ if (!space_info->flush) {
+ space_info->flush = 1;
+ trace_btrfs_trigger_flush(root->fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "enospc");
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
+ }
+ } else {
+ list_add_tail(&ticket.list,
+ &space_info->priority_tickets);
+ }
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
@@ -5019,39 +5197,67 @@ again:
* the async reclaim as we will panic.
*/
if (!root->fs_info->log_root_recovering &&
- need_do_async_reclaim(space_info, root->fs_info, used) &&
- !work_busy(&root->fs_info->async_reclaim_work))
+ need_do_async_reclaim(space_info, root, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work)) {
+ trace_btrfs_trigger_flush(root->fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "preempt");
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
+ }
}
spin_unlock(&space_info->lock);
-
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
- goto out;
+ return ret;
- ret = flush_space(root, space_info, num_bytes, orig_bytes,
- flush_state);
- flush_state++;
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ return wait_reserve_ticket(root->fs_info, space_info, &ticket,
+ orig_bytes);
- /*
- * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
- * would happen. So skip delalloc flush.
- */
- if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
- (flush_state == FLUSH_DELALLOC ||
- flush_state == FLUSH_DELALLOC_WAIT))
- flush_state = ALLOC_CHUNK;
+ ret = 0;
+ priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
+ spin_lock(&space_info->lock);
+ if (ticket.bytes) {
+ if (ticket.bytes < orig_bytes) {
+ u64 num_bytes = orig_bytes - ticket.bytes;
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);
- if (!ret)
- goto again;
- else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
- flush_state < COMMIT_TRANS)
- goto again;
- else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- flush_state <= COMMIT_TRANS)
- goto again;
+ }
+ list_del_init(&ticket.list);
+ ret = -ENOSPC;
+ }
+ spin_unlock(&space_info->lock);
+ ASSERT(list_empty(&ticket.list));
+ return ret;
+}
-out:
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
+ flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
struct btrfs_block_rsv *global_rsv =
@@ -5064,13 +5270,8 @@ out:
if (ret == -ENOSPC)
trace_btrfs_space_reservation(root->fs_info,
"space_info:enospc",
- space_info->flags, orig_bytes, 1);
- if (flushing) {
- spin_lock(&space_info->lock);
- space_info->flush = 0;
- wake_up_all(&space_info->wait);
- spin_unlock(&space_info->lock);
- }
+ block_rsv->space_info->flags,
+ orig_bytes, 1);
return ret;
}
@@ -5146,6 +5347,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ struct reserve_ticket *ticket;
+ struct list_head *head;
+ u64 used;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+ bool check_overcommit = false;
+ /* Returned bytes are handed to waiting tickets: priority list first, then the normal list via "again". */
+ spin_lock(&space_info->lock);
+ head = &space_info->priority_tickets;
+
+ /*
+ * If we are over our limit then we need to check and see if we can
+ * overcommit, and if we can't then we just need to free up our space
+ * and not satisfy any requests.
+ */
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (used - num_bytes >= space_info->total_bytes)
+ check_overcommit = true;
+again:
+ while (!list_empty(head) && num_bytes) {
+ ticket = list_first_entry(head, struct reserve_ticket,
+ list);
+ /*
+ * We use 0 bytes because this space is already reserved, so
+ * adding the ticket space would be a double count.
+ */
+ if (check_overcommit &&
+ !can_overcommit(fs_info->extent_root, space_info, 0,
+ flush))
+ break;
+ if (num_bytes >= ticket->bytes) {
+ list_del_init(&ticket->list);
+ num_bytes -= ticket->bytes;
+ ticket->bytes = 0; /* fully satisfied: bytes_may_use is left as is, the space just changes owner */
+ wake_up(&ticket->wait);
+ } else {
+ ticket->bytes -= num_bytes; /* partial fill; the ticket stays queued */
+ num_bytes = 0;
+ }
+ }
+ /* bytes still left after the priority list are offered to the normal tickets */
+ if (num_bytes && head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ goto again;
+ }
+ space_info->bytes_may_use -= num_bytes; /* only what no ticket consumed is actually released */
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
+ spin_unlock(&space_info->lock);
+}
+
+/*
+ * This is for newly allocated space that isn't accounted in
+ * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
+ * we use this helper.
+ */
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ struct reserve_ticket *ticket;
+ struct list_head *head = &space_info->priority_tickets;
+ /* No locking is done here; NOTE(review): the update_space_info() call site runs before found->lock is released - confirm other callers. */
+again:
+ while (!list_empty(head) && num_bytes) {
+ ticket = list_first_entry(head, struct reserve_ticket,
+ list);
+ if (num_bytes >= ticket->bytes) {
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags,
+ ticket->bytes, 1);
+ list_del_init(&ticket->list);
+ num_bytes -= ticket->bytes;
+ space_info->bytes_may_use += ticket->bytes; /* new space is accounted as reserved on behalf of the ticket */
+ ticket->bytes = 0;
+ wake_up(&ticket->wait);
+ } else {
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags,
+ num_bytes, 1);
+ space_info->bytes_may_use += num_bytes;
+ ticket->bytes -= num_bytes; /* partial fill; the ticket stays queued */
+ num_bytes = 0;
+ }
+ }
+ /* bytes still left after the priority list are offered to the normal tickets */
+ if (num_bytes && head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ goto again;
+ }
+}
+
static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5180,18 +5483,15 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
}
spin_unlock(&dest->lock);
}
- if (num_bytes) {
- spin_lock(&space_info->lock);
- space_info->bytes_may_use -= num_bytes;
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- spin_unlock(&space_info->lock);
- }
+ if (num_bytes)
+ space_info_add_old_bytes(fs_info, space_info,
+ num_bytes);
}
}
-static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
- struct btrfs_block_rsv *dst, u64 num_bytes)
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes,
+ int update_size)
{
int ret;
@@ -5199,7 +5499,7 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
if (ret)
return ret;
- block_rsv_add_bytes(dst, num_bytes, 1);
+ block_rsv_add_bytes(dst, num_bytes, update_size);
return 0;
}
@@ -5306,13 +5606,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
- struct btrfs_block_rsv *dst_rsv,
- u64 num_bytes)
-{
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
-}
-
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
@@ -5325,48 +5618,21 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
num_bytes);
}
-/*
- * helper to calculate size of global block reservation.
- * the desired value is sum of space used by extent tree,
- * checksum tree and root tree
- */
-static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_space_info *sinfo;
- u64 num_bytes;
- u64 meta_used;
- u64 data_used;
- int csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- spin_lock(&sinfo->lock);
- data_used = sinfo->bytes_used;
- spin_unlock(&sinfo->lock);
-
- sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
- spin_lock(&sinfo->lock);
- if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used = 0;
- meta_used = sinfo->bytes_used;
- spin_unlock(&sinfo->lock);
-
- num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
- csum_size * 2;
- num_bytes += div_u64(data_used + meta_used, 50);
-
- if (num_bytes * 3 > meta_used)
- num_bytes = div_u64(meta_used, 3);
-
- return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
-}
-
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
u64 num_bytes;
- num_bytes = calc_global_metadata_size(fs_info);
+ /*
+ * The global block rsv is based on the size of the extent tree, the
+ * checksum tree and the root tree. If the fs is empty we want to set
+ * it to a minimal amount for safety.
+ */
+ num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
+ btrfs_root_used(&fs_info->csum_root->root_item) +
+ btrfs_root_used(&fs_info->tree_root->root_item);
+ num_bytes = max_t(u64, num_bytes, SZ_16M);
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
@@ -5464,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
- struct btrfs_fs_info *fs_info = trans->root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
if (!trans->chunk_bytes_reserved)
return;
@@ -5481,7 +5747,13 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ /*
+ * We always use trans->block_rsv here as we will have reserved space
+ * for our orphan when starting the transaction. Using get_block_rsv()
+ * here will sometimes make us choose the wrong block rsv as we could be
+ * doing a reloc inode for a non-refcounted root.
+ */
+ struct btrfs_block_rsv *src_rsv = trans->block_rsv;
struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
/*
@@ -5492,7 +5764,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
trace_btrfs_space_reservation(root->fs_info, "orphan",
btrfs_ino(inode), num_bytes, 1);
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
void btrfs_orphan_release_metadata(struct inode *inode)
@@ -5516,7 +5788,7 @@ void btrfs_orphan_release_metadata(struct inode *inode)
* common file/directory operations, they change two fs/file trees
* and root tree, the number of items that the qgroup reserves is
* different with the free space reservation. So we can not use
- * the space reseravtion mechanism in start_transaction().
+ * the space reservation mechanism in start_transaction().
*/
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
@@ -5547,7 +5819,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && *qgroup_reserved)
btrfs_qgroup_free_meta(root, *qgroup_reserved);
@@ -5565,7 +5837,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
/**
* drop_outstanding_extent - drop an outstanding extent
* @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're relaseing.
+ * @num_bytes: the number of bytes we're releasing.
*
* This is called when we are freeing up an outstanding extent, either called
* after an error or after an extent is written. This will return the number of
@@ -5591,7 +5863,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
drop_inode_space = 1;
/*
- * If we have more or the same amount of outsanding extents than we have
+ * If we have more or the same amount of outstanding extents than we have
* reserved then we need to leave the reserved extents count alone.
*/
if (BTRFS_I(inode)->outstanding_extents >=
@@ -5605,8 +5877,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
}
/**
- * calc_csum_metadata_size - return the amount of metada space that must be
- * reserved/free'd for the given bytes.
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ * reserved/freed for the given bytes.
* @inode: the inode we're manipulating
* @num_bytes: the number of bytes in question
* @reserve: 1 if we are reserving space, 0 if we are freeing space
@@ -5657,21 +5929,26 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
u64 to_reserve = 0;
u64 csum_bytes;
unsigned nr_extents = 0;
- int extra_reserve = 0;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
u64 to_free = 0;
unsigned dropped;
+ bool release_extra = false;
/* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
* mutex since we won't race with anybody. We need this mostly to make
* lockdep shut its filthy mouth.
+ *
+ * If we have a transaction open (can happen if we call truncate_block
+ * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
+ } else if (current->journal_info) {
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
}
if (flush != BTRFS_RESERVE_NO_FLUSH &&
@@ -5688,24 +5965,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
BTRFS_MAX_EXTENT_SIZE - 1,
BTRFS_MAX_EXTENT_SIZE);
BTRFS_I(inode)->outstanding_extents += nr_extents;
- nr_extents = 0;
+ nr_extents = 0;
if (BTRFS_I(inode)->outstanding_extents >
BTRFS_I(inode)->reserved_extents)
- nr_extents = BTRFS_I(inode)->outstanding_extents -
+ nr_extents += BTRFS_I(inode)->outstanding_extents -
BTRFS_I(inode)->reserved_extents;
- /*
- * Add an item to reserve for updating the inode when we complete the
- * delalloc io.
- */
- if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags)) {
- nr_extents++;
- extra_reserve = 1;
- }
-
- to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ /* We always want to reserve a slot for updating the inode. */
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5717,17 +5985,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
goto out_fail;
}
- ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
spin_lock(&BTRFS_I(inode)->lock);
- if (extra_reserve) {
- set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags);
- nr_extents--;
+ if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
+ to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
+ release_extra = true;
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5738,8 +6006,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (to_reserve)
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_reserve, 1);
- block_rsv_add_bytes(block_rsv, to_reserve, 1);
-
+ if (release_extra)
+ btrfs_block_rsv_release(root, block_rsv,
+ btrfs_calc_trans_metadata_size(root,
+ 1));
return 0;
out_fail:
@@ -5758,7 +6028,7 @@ out_fail:
/*
* This is tricky, but first we need to figure out how much we
- * free'd from any free-ers that occurred during this
+ * freed from any free-ers that occurred during this
* reservation, so we reset ->csum_bytes to the csum_bytes
* before we dropped our lock, and then call the free for the
* number of bytes that were freed while we were trying our
@@ -5780,7 +6050,7 @@ out_fail:
/*
* Now reset ->csum_bytes to what it should be. If bytes is
- * more than to_free then we would have free'd more space had we
+ * more than to_free then we would have freed more space had we
* not had an artificially high ->csum_bytes, so we need to free
* the remainder. If bytes is the same or less then we don't
* need to do anything, the other free-ers did the correct
@@ -5831,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return;
trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5946,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- if (btrfs_test_opt(root, SPACE_CACHE) &&
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
@@ -5971,6 +6241,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
+ trace_btrfs_space_reservation(root->fs_info, "pinned",
+ cache->space_info->flags,
+ num_bytes, 1);
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
@@ -6045,10 +6318,10 @@ static int pin_down_extent(struct btrfs_root *root,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
+ trace_btrfs_space_reservation(root->fs_info, "pinned",
+ cache->space_info->flags, num_bytes, 1);
set_extent_dirty(root->fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
- if (reserved)
- trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
return 0;
}
@@ -6172,6 +6445,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
return 0;
}
+static void
+btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+ atomic_inc(&bg->reservations);
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start)
+{
+ struct btrfs_block_group_cache *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, start);
+ ASSERT(bg);
+ if (atomic_dec_and_test(&bg->reservations))
+ wake_up_atomic_t(&bg->reservations);
+ btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
+{
+ schedule();
+ return 0;
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+ struct btrfs_space_info *space_info = bg->space_info;
+
+ ASSERT(bg->ro);
+
+ if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+ return;
+
+ /*
+ * Our block group is read only but before we set it to read only,
+ * some task might already have allocated an extent from it without
+ * having yet created the respective ordered extent (and added it to a
+ * root's list of ordered extents).
+ * Therefore wait for any task currently allocating extents, since the
+ * block group's reservations counter is incremented while a read lock
+ * on the groups' semaphore is held and decremented after releasing
+ * the read access on that semaphore and creating the ordered extent.
+ */
+ down_write(&space_info->groups_sem);
+ up_write(&space_info->groups_sem);
+
+ wait_on_atomic_t(&bg->reservations,
+ btrfs_wait_bg_reservations_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+}
+
/**
* btrfs_update_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
@@ -6274,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
u64 *empty_cluster)
{
struct btrfs_free_cluster *ret = NULL;
- bool ssd = btrfs_test_opt(root, SSD);
+ bool ssd = btrfs_test_opt(root->fs_info, SSD);
*empty_cluster = 0;
if (btrfs_mixed_space_info(space_info))
@@ -6352,6 +6676,9 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+
+ trace_btrfs_space_reservation(fs_info, "pinned",
+ space_info->flags, len, 0);
space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
@@ -6359,17 +6686,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
readonly = true;
}
spin_unlock(&cache->lock);
- if (!readonly && global_rsv->space_info == space_info) {
+ if (!readonly && return_free_space &&
+ global_rsv->space_info == space_info) {
+ u64 to_add = len;
+ WARN_ON(!return_free_space);
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
- len = min(len, global_rsv->size -
- global_rsv->reserved);
- global_rsv->reserved += len;
- space_info->bytes_may_use += len;
+ to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += to_add;
+ space_info->bytes_may_use += to_add;
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
+ trace_btrfs_space_reservation(fs_info,
+ "space_info",
+ space_info->flags,
+ to_add, 1);
+ len -= to_add;
}
spin_unlock(&global_rsv->lock);
+ /* Add to any tickets we may have */
+ if (len)
+ space_info_add_new_bytes(fs_info, space_info,
+ len);
}
spin_unlock(&space_info->lock);
}
@@ -6404,11 +6743,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
break;
}
- if (btrfs_test_opt(root, DISCARD))
+ if (btrfs_test_opt(root->fs_info, DISCARD))
ret = btrfs_discard_extent(root, start,
end + 1 - start, NULL);
- clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ clear_extent_dirty(unpin, start, end);
unpin_extent_range(root, start, end, true);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
@@ -6542,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
NULL, refs_to_drop,
is_data, &last_ref);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
@@ -6591,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
path->nodes[0]);
}
if (ret < 0) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
extent_slot = path->slots[0];
@@ -6602,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
bytenr, parent, root_objectid, owner_objectid,
owner_offset);
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
} else {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -6617,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
if (ret < 0) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -6636,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root, path->nodes[0]);
}
if (ret < 0) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -6661,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_err(info, "trying to drop %d refs but we only have %Lu "
"for bytenr %Lu", refs_to_drop, refs, bytenr);
ret = -EINVAL;
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
refs -= refs_to_drop;
@@ -6684,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
iref, refs_to_drop,
is_data, &last_ref);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -6707,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
@@ -6715,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -6723,13 +7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
num_bytes);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -6878,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(fs_info))
return 0;
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
@@ -7025,36 +7364,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
int delalloc)
{
struct btrfs_block_group_cache *used_bg = NULL;
- bool locked = false;
-again:
+
spin_lock(&cluster->refill_lock);
- if (locked) {
- if (used_bg == cluster->block_group)
+ while (1) {
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
return used_bg;
- up_read(&used_bg->data_rwsem);
- btrfs_put_block_group(used_bg);
- }
+ btrfs_get_block_group(used_bg);
- used_bg = cluster->block_group;
- if (!used_bg)
- return NULL;
+ if (!delalloc)
+ return used_bg;
- if (used_bg == block_group)
- return used_bg;
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
- btrfs_get_block_group(used_bg);
+ spin_unlock(&cluster->refill_lock);
- if (!delalloc)
- return used_bg;
+ down_read(&used_bg->data_rwsem);
- if (down_read_trylock(&used_bg->data_rwsem))
- return used_bg;
+ spin_lock(&cluster->refill_lock);
+ if (used_bg == cluster->block_group)
+ return used_bg;
- spin_unlock(&cluster->refill_lock);
- down_read(&used_bg->data_rwsem);
- locked = true;
- goto again;
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
}
static inline void
@@ -7431,6 +7769,7 @@ checks:
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
}
+ btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
ins->objectid = search_start;
@@ -7471,7 +7810,7 @@ loop:
if (loop == LOOP_CACHING_NOWAIT) {
/*
* We want to skip the LOOP_CACHING_WAIT step if we
- * don't have any unached bgs and we've alrelady done a
+ * don't have any uncached bgs and we've already done a
* full search through.
*/
if (orig_have_caching_bg || !full_search)
@@ -7513,8 +7852,7 @@ loop:
* can do more things.
*/
if (ret < 0 && ret != -ENOSPC)
- btrfs_abort_transaction(trans,
- root, ret);
+ btrfs_abort_transaction(trans, ret);
else
ret = 0;
if (!exist)
@@ -7568,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
info->flags,
info->total_bytes - info->bytes_used - info->bytes_pinned -
- info->bytes_reserved - info->bytes_readonly,
- (info->full) ? "" : "not ");
+ info->bytes_reserved - info->bytes_readonly -
+ info->bytes_may_use, (info->full) ? "" : "not ");
printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
"reserved=%llu, may_use=%llu, readonly=%llu\n",
info->total_bytes, info->bytes_used, info->bytes_pinned,
@@ -7612,8 +7950,10 @@ again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
flags, delalloc);
-
- if (ret == -ENOSPC) {
+ if (!ret && !is_data) {
+ btrfs_dec_block_group_reservations(root->fs_info,
+ ins->objectid);
+ } else if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
num_bytes = min(num_bytes >> 1, ins->offset);
num_bytes = round_down(num_bytes, root->sectorsize);
@@ -7621,7 +7961,7 @@ again:
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
- } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
struct btrfs_space_info *sinfo;
sinfo = __find_space_info(root->fs_info, flags);
@@ -7652,16 +7992,14 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
if (pin)
pin_down_extent(root, cache, start, len, 1);
else {
- if (btrfs_test_opt(root, DISCARD))
+ if (btrfs_test_opt(root->fs_info, DISCARD))
ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+ trace_btrfs_reserved_extent_free(root, start, len);
}
btrfs_put_block_group(cache);
-
- trace_btrfs_reserved_extent_free(root, start, len);
-
return ret;
}
@@ -7873,7 +8211,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
/*
* Mixed block groups will exclude before processing the log so we only
- * need to do the exlude dance if this fs isn't mixed.
+ * need to do the exclude dance if this fs isn't mixed.
*/
if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
@@ -7901,8 +8239,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf;
buf = btrfs_find_create_tree_block(root, bytenr);
- if (!buf)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(buf))
+ return buf;
+
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
@@ -7923,13 +8262,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1, GFP_NOFS);
+ buf->start + buf->len - 1);
} else {
buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
- trans->blocks_used++;
+ trans->dirty = true;
/* this returns a buffer locked for blocking */
return buf;
}
@@ -7961,7 +8300,7 @@ again:
goto again;
}
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL * 10,
/*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -8015,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
- if (btrfs_test_is_dummy_root(root)) {
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (btrfs_is_testing(root->fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
+#endif
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
@@ -8201,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+ if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
+ delayed_refs, qrecord))
kfree(qrecord);
spin_unlock(&delayed_refs->lock);
@@ -8544,8 +8886,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = btrfs_find_tree_block(root->fs_info, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr);
- if (!next)
- return -ENOMEM;
+ if (IS_ERR(next))
+ return PTR_ERR(next);
+
btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
level - 1);
reada = 1;
@@ -8985,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
&root->root_key,
root_item);
if (ret) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -9012,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_del_root(trans, tree_root, &root->root_key);
if (ret) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -9020,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
} else if (ret > 0) {
@@ -9058,7 +9401,7 @@ out:
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
- btrfs_std_error(root->fs_info, err, NULL);
+ btrfs_handle_fs_error(root->fs_info, err, NULL);
return err;
}
@@ -9317,7 +9660,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
u64 free_bytes = 0;
int factor;
- /* It's df, we don't care if it's racey */
+ /* It's df, we don't care if it's racy */
if (list_empty(&sinfo->ro_bgs))
return 0;
@@ -9386,15 +9729,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
u64 dev_min = 1;
u64 dev_nr = 0;
u64 target;
+ int debug;
int index;
int full = 0;
int ret = 0;
+ debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
+
block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
/* odd, couldn't find the block group, leave it alone */
- if (!block_group)
+ if (!block_group) {
+ if (debug)
+ btrfs_warn(root->fs_info,
+ "can't find block group for bytenr %llu",
+ bytenr);
return -1;
+ }
min_free = btrfs_block_group_used(&block_group->item);
@@ -9448,8 +9799,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* this is just a balance, so if we were marked as full
* we know there is no space for a new chunk
*/
- if (full)
+ if (full) {
+ if (debug)
+ btrfs_warn(root->fs_info,
+ "no space to alloc new chunk for block group %llu",
+ block_group->key.objectid);
goto out;
+ }
index = get_block_group_index(block_group);
}
@@ -9496,6 +9852,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
ret = -1;
}
}
+ if (debug && ret == -1)
+ btrfs_warn(root->fs_info,
+ "no space to allocate a new chunk for block group %llu",
+ block_group->key.objectid);
mutex_unlock(&root->fs_info->chunk_mutex);
btrfs_end_transaction(trans, root);
out:
@@ -9530,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root,
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, found_key.objectid,
+ found_key.offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(root->fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ found_key.objectid, found_key.offset);
+ ret = -ENOENT;
+ } else {
+ ret = 0;
+ }
goto out;
}
path->slots[0]++;
@@ -9646,13 +10021,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
- if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
- if (WARN_ON(space_info->bytes_pinned > 0 ||
+
+ /*
+ * Do not hide this behind enospc_debug; this is actually
+ * important and indicates a real bug if it happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0)) {
- dump_space_info(space_info, 0, 0);
- }
- }
+ space_info->bytes_may_use > 0))
+ dump_space_info(space_info, 0, 0);
list_del(&space_info->list);
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
struct kobject *kobj;
@@ -9770,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (btrfs_test_opt(root, SPACE_CACHE) &&
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
need_clear = 1;
- if (btrfs_test_opt(root, CLEAR_CACHE))
+ if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
need_clear = 1;
while (1) {
@@ -9804,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
- if (btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
cache->disk_cache_state = BTRFS_DC_CLEAR;
}
@@ -9860,9 +10237,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
goto error;
}
+ trace_btrfs_add_block_group(root->fs_info, cache, 0);
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
- &space_info);
+ cache->bytes_super, &space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
spin_lock(&info->block_group_cache_lock);
@@ -9875,9 +10253,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
}
cache->space_info = space_info;
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_readonly += cache->bytes_super;
- spin_unlock(&cache->space_info->lock);
__link_block_group(space_info, cache);
@@ -9948,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, extent_root, &key, &item,
sizeof(item));
if (ret)
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
ret = btrfs_finish_chunk_alloc(trans, extent_root,
key.objectid, key.offset);
if (ret)
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, root->fs_info, block_group);
/* already aborted the transaction if it failed. */
next:
@@ -9969,7 +10344,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *extent_root;
struct btrfs_block_group_cache *cache;
-
extent_root = root->fs_info->extent_root;
btrfs_set_log_full_commit(root->fs_info, trans);
@@ -10015,7 +10389,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* assigned to our block group, but don't update its counters just yet.
* We want our bg to be added to the rbtree with its ->space_info set.
*/
- ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+ ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
&cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -10034,8 +10408,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* Now that our block group has its ->space_info set and is inserted in
* the rbtree, update the space info's counters.
*/
+ trace_btrfs_add_block_group(root->fs_info, cache, 1);
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
- &cache->space_info);
+ cache->bytes_super, &cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
spin_lock(&root->fs_info->block_group_cache_lock);
@@ -10048,16 +10423,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
}
update_global_block_rsv(root->fs_info);
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_readonly += cache->bytes_super;
- spin_unlock(&cache->space_info->lock);
-
__link_block_group(cache->space_info, cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
-
return 0;
}
@@ -10270,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
WARN_ON(block_group->space_info->total_bytes
< block_group->key.offset);
WARN_ON(block_group->space_info->bytes_readonly
@@ -10509,14 +10879,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
- EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
- EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_dec_block_group_ro(root, block_group);
@@ -10538,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&space_info->lock);
/* DISCARD can flip during remount */
- trimming = btrfs_test_opt(root, DISCARD);
+ trimming = btrfs_test_opt(root->fs_info, DISCARD);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -10602,21 +10972,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
}
out:
return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 76a0c8597d98d..44fe66b53c8b4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -163,13 +163,13 @@ int __init extent_io_init(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
goto free_state_cache;
@@ -726,14 +726,6 @@ next:
start = last_end + 1;
if (start <= end && state && !need_resched())
goto hit_next;
- goto search_again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return 0;
search_again:
if (start > end)
@@ -742,6 +734,14 @@ search_again:
if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
}
static void wait_on_state(struct extent_io_tree *tree,
@@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
prealloc = alloc_extent_state(mask);
- BUG_ON(!prealloc);
}
spin_lock(&tree->lock);
@@ -1037,7 +1043,13 @@ hit_next:
goto out;
}
- goto search_again;
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
out:
spin_unlock(&tree->lock);
@@ -1046,13 +1058,6 @@ out:
return err;
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
}
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
* @cached_state: state that we're going to cache
- * @mask: the allocation mask
*
* This will go through and set bits for the given range. If any states exist
* already in this range they are set with the given bit and cleared of the
* clear_bits. This is only meant to be used by things that are mergeable, ie
* converting from say DELALLOC to DIRTY. This is not meant to be used with
* boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state, gfp_t mask)
+ struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
+ if (!prealloc) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
@@ -1106,7 +1112,7 @@ again:
* extent state allocations are needed. We'll only know this
* after locking the tree.
*/
- prealloc = alloc_extent_state(mask);
+ prealloc = alloc_extent_state(GFP_NOFS);
if (!prealloc && !first_iteration)
return -ENOMEM;
}
@@ -1263,7 +1269,13 @@ hit_next:
goto out;
}
- goto search_again;
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
out:
spin_unlock(&tree->lock);
@@ -1271,21 +1283,11 @@ out:
free_extent_state(prealloc);
return err;
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- first_iteration = false;
- goto again;
}
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset)
+ unsigned bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+ return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
changeset);
}
@@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset)
+ unsigned bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
@@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+ return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
changeset);
}
@@ -1363,23 +1364,23 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
page = find_get_page(inode->i_mapping, index);
BUG_ON(!page); /* Pages should be in the extent_io_tree */
clear_page_dirty_for_io(page);
- page_cache_release(page);
+ put_page(page);
index++;
}
}
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
@@ -1387,7 +1388,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
BUG_ON(!page); /* Pages should be in the extent_io_tree */
__set_page_dirty_nobuffers(page);
account_page_redirty(page);
- page_cache_release(page);
+ put_page(page);
index++;
}
}
@@ -1397,15 +1398,15 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
*/
static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
page = find_get_page(tree->mapping, index);
BUG_ON(!page); /* Pages should be in the extent_io_tree */
set_page_writeback(page);
- page_cache_release(page);
+ put_page(page);
index++;
}
}
@@ -1556,8 +1557,8 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
{
int ret;
struct page *pages[16];
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
@@ -1571,7 +1572,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
for (i = 0; i < ret; i++) {
if (pages[i] != locked_page)
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
@@ -1584,9 +1585,9 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 delalloc_start,
u64 delalloc_end)
{
- unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+ unsigned long index = delalloc_start >> PAGE_SHIFT;
unsigned long start_index = index;
- unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = delalloc_end >> PAGE_SHIFT;
unsigned long pages_locked = 0;
struct page *pages[16];
unsigned long nrpages;
@@ -1619,11 +1620,11 @@ static noinline int lock_delalloc_pages(struct inode *inode,
pages[i]->mapping != inode->i_mapping) {
ret = -EAGAIN;
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
goto done;
}
}
- page_cache_release(pages[i]);
+ put_page(pages[i]);
pages_locked++;
}
nrpages -= ret;
@@ -1636,7 +1637,7 @@ done:
__unlock_for_delalloc(inode, locked_page,
delalloc_start,
((u64)(start_index + pages_locked - 1)) <<
- PAGE_CACHE_SHIFT);
+ PAGE_SHIFT);
}
return ret;
}
@@ -1696,7 +1697,7 @@ again:
free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
- max_bytes = PAGE_CACHE_SIZE;
+ max_bytes = PAGE_SIZE;
loops = 1;
goto again;
} else {
@@ -1735,8 +1736,8 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
int ret;
struct page *pages[16];
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
@@ -1757,7 +1758,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
SetPagePrivate2(pages[i]);
if (pages[i] == locked_page) {
- page_cache_release(pages[i]);
+ put_page(pages[i]);
continue;
}
if (page_ops & PAGE_CLEAR_DIRTY)
@@ -1770,7 +1771,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
end_page_writeback(pages[i]);
if (page_ops & PAGE_UNLOCK)
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
@@ -1961,7 +1962,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
}
@@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
set_state_failrec(failure_tree, rec->start, NULL);
ret = clear_extent_bits(failure_tree, rec->start,
rec->start + rec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY);
if (ret)
err = ret;
ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
rec->start + rec->len - 1,
- EXTENT_DAMAGED, GFP_NOFS);
+ EXTENT_DAMAGED);
if (ret && !err)
err = ret;
@@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
bio->bi_iter.bi_size = 0;
map_length = length;
+ /*
+ * Avoid races with device replace and make sure our bbio has devices
+ * associated to its stripes that don't go away while we are doing the
+ * read repair operation.
+ */
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
@@ -2036,14 +2044,17 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
dev = bbio->stripes[mirror_num-1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
bio->bi_bdev = dev->bdev;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
bio_add_page(bio, page, length, pg_offset);
- if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
+ if (btrfsic_submit_bio_wait(bio)) {
/* try to remap that extent elsewhere? */
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
@@ -2053,6 +2064,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
btrfs_ino(inode), start,
rcu_str_deref(dev->name), sector);
+ btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return 0;
}
@@ -2071,11 +2083,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
struct page *p = eb->pages[i];
ret = repair_io_failure(root->fs_info->btree_inode, start,
- PAGE_CACHE_SIZE, start, p,
+ PAGE_SIZE, start, p,
start - page_offset(p), mirror_num);
if (ret)
break;
- start += PAGE_CACHE_SIZE;
+ start += PAGE_SIZE;
}
return ret;
@@ -2232,13 +2244,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
/* set the bits in the private failure tree */
ret = set_extent_bits(failure_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY);
if (ret >= 0)
ret = set_state_failrec(failure_tree, start, failrec);
/* set the bits in the inode's tree */
if (ret >= 0)
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
- GFP_NOFS);
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
if (ret < 0) {
kfree(failrec);
return ret;
@@ -2376,7 +2387,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode;
int ret;
- BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+ BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
if (ret)
@@ -2402,12 +2413,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
free_io_failure(inode, failrec);
return -EIO;
}
+ bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
read_mode, failrec->this_mirror, failrec->in_validation);
- ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
- failrec->this_mirror,
+ ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
failrec->bio_flags, 0);
if (ret) {
free_io_failure(inode, failrec);
@@ -2466,8 +2477,8 @@ static void end_bio_extent_writepage(struct bio *bio)
* advance bv_offset and adjust bv_len to compensate.
* Print a warning for nonzero offsets, and an error
* if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
"partial page write in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
@@ -2541,8 +2552,8 @@ static void end_bio_extent_readpage(struct bio *bio)
* advance bv_offset and adjust bv_len to compensate.
* Print a warning for nonzero offsets, and an error
* if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
"partial page read in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
@@ -2598,13 +2609,13 @@ static void end_bio_extent_readpage(struct bio *bio)
readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned off;
/* Zero out the end if this page straddles i_size */
- off = i_size & (PAGE_CACHE_SIZE-1);
+ off = i_size & (PAGE_SIZE-1);
if (page->index == end_index && off)
- zero_user_segment(page, off, PAGE_CACHE_SIZE);
+ zero_user_segment(page, off, PAGE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -2686,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
btrfs_bio->csum = NULL;
btrfs_bio->csum_allocated = NULL;
btrfs_bio->end_io = NULL;
-
-#ifdef CONFIG_BLK_CGROUP
- /* FIXME, put this into bio_clone_bioset */
- if (bio->bi_css)
- bio_associate_blkcg(new, bio->bi_css);
-#endif
}
return new;
}
@@ -2713,8 +2718,8 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
}
-static int __must_check submit_one_bio(int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
int ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2725,33 +2730,31 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
-
bio_get(bio);
if (tree->ops && tree->ops->submit_bio_hook)
- ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+ ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
mirror_num, bio_flags, start);
else
- btrfsic_submit_bio(rw, bio);
+ btrfsic_submit_bio(bio);
bio_put(bio);
return ret;
}
-static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
+static int merge_bio(struct extent_io_tree *tree, struct page *page,
unsigned long offset, size_t size, struct bio *bio,
unsigned long bio_flags)
{
int ret = 0;
if (tree->ops && tree->ops->merge_bio_hook)
- ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
+ ret = tree->ops->merge_bio_hook(page, offset, size, bio,
bio_flags);
- BUG_ON(ret < 0);
return ret;
}
-static int submit_extent_page(int rw, struct extent_io_tree *tree,
+static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
struct writeback_control *wbc,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
@@ -2768,7 +2771,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
struct bio *bio;
int contig = 0;
int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
- size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+ size_t page_size = min_t(size_t, size, PAGE_SIZE);
if (bio_ret && *bio_ret) {
bio = *bio_ret;
@@ -2779,10 +2782,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
if (prev_bio_flags != bio_flags || !contig ||
force_bio_submit ||
- merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
+ merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
- ret = submit_one_bio(rw, bio, mirror_num,
- prev_bio_flags);
+ ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
*bio_ret = NULL;
return ret;
@@ -2803,6 +2805,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ bio_set_op_attrs(bio, op, op_flags);
if (wbc) {
wbc_init_bio(wbc, bio);
wbc_account_io(wbc, page, page_size);
@@ -2811,7 +2814,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
if (bio_ret)
*bio_ret = bio;
else
- ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
+ ret = submit_one_bio(bio, mirror_num, bio_flags);
return ret;
}
@@ -2821,7 +2824,7 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long)eb);
} else {
WARN_ON(page->private != (unsigned long)eb);
@@ -2832,7 +2835,7 @@ void set_page_extent_mapped(struct page *page)
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, EXTENT_PAGE_PRIVATE);
}
}
@@ -2869,18 +2872,19 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* into the tree that are removed when the IO is done (by the end_io
* handlers)
* XXX JDM: This needs looking at to ensure proper page locking
+ * return 0 on success, otherwise return error
*/
static int __do_readpage(struct extent_io_tree *tree,
struct page *page,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw,
+ unsigned long *bio_flags, int read_flags,
u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = start + PAGE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
@@ -2890,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree,
sector_t sector;
struct extent_map *em;
struct block_device *bdev;
- int ret;
+ int ret = 0;
int nr = 0;
size_t pg_offset = 0;
size_t iosize;
@@ -2909,12 +2913,12 @@ static int __do_readpage(struct extent_io_tree *tree,
}
}
- if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+ if (page->index == last_byte >> PAGE_SHIFT) {
char *userpage;
- size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+ size_t zero_offset = last_byte & (PAGE_SIZE - 1);
if (zero_offset) {
- iosize = PAGE_CACHE_SIZE - zero_offset;
+ iosize = PAGE_SIZE - zero_offset;
userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, iosize);
flush_dcache_page(page);
@@ -2922,14 +2926,14 @@ static int __do_readpage(struct extent_io_tree *tree,
}
}
while (cur <= end) {
- unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+ unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1;
bool force_bio_submit = false;
if (cur >= last_byte) {
char *userpage;
struct extent_state *cached = NULL;
- iosize = PAGE_CACHE_SIZE - pg_offset;
+ iosize = PAGE_SIZE - pg_offset;
userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
@@ -3058,8 +3062,8 @@ static int __do_readpage(struct extent_io_tree *tree,
}
pnr -= page->index;
- ret = submit_extent_page(rw, tree, NULL, page,
- sector, disk_io_size, pg_offset,
+ ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL,
+ page, sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
@@ -3071,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
} else {
SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ goto out;
}
cur = cur + iosize;
pg_offset += iosize;
@@ -3081,7 +3086,7 @@ out:
SetPageUptodate(page);
unlock_page(page);
}
- return 0;
+ return ret;
}
static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
@@ -3090,7 +3095,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw,
+ unsigned long *bio_flags,
u64 *prev_em_start)
{
struct inode *inode;
@@ -3111,8 +3116,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], get_extent, em_cached, bio,
- mirror_num, bio_flags, rw, prev_em_start);
- page_cache_release(pages[index]);
+ mirror_num, bio_flags, 0, prev_em_start);
+ put_page(pages[index]);
}
}
@@ -3121,7 +3126,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
int nr_pages, get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw,
+ unsigned long *bio_flags,
u64 *prev_em_start)
{
u64 start = 0;
@@ -3134,18 +3139,18 @@ static void __extent_readpages(struct extent_io_tree *tree,
page_start = page_offset(pages[index]);
if (!end) {
start = page_start;
- end = start + PAGE_CACHE_SIZE - 1;
+ end = start + PAGE_SIZE - 1;
first_index = index;
} else if (end + 1 == page_start) {
- end += PAGE_CACHE_SIZE;
+ end += PAGE_SIZE;
} else {
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
end, get_extent, em_cached,
bio, mirror_num, bio_flags,
- rw, prev_em_start);
+ prev_em_start);
start = page_start;
- end = start + PAGE_CACHE_SIZE - 1;
+ end = start + PAGE_SIZE - 1;
first_index = index;
}
}
@@ -3154,7 +3159,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
end, get_extent, em_cached, bio,
- mirror_num, bio_flags, rw,
+ mirror_num, bio_flags,
prev_em_start);
}
@@ -3162,18 +3167,18 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
get_extent_t *get_extent,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw)
+ unsigned long *bio_flags, int read_flags)
{
struct inode *inode = page->mapping->host;
struct btrfs_ordered_extent *ordered;
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
int ret;
while (1) {
lock_extent(tree, start, end);
ordered = btrfs_lookup_ordered_range(inode, start,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (!ordered)
break;
unlock_extent(tree, start, end);
@@ -3182,7 +3187,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
- bio_flags, rw, NULL);
+ bio_flags, read_flags, NULL);
return ret;
}
@@ -3194,20 +3199,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
int ret;
ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
- &bio_flags, READ);
+ &bio_flags, 0);
if (bio)
- ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ ret = submit_one_bio(bio, mirror_num, bio_flags);
return ret;
}
-static noinline void update_nr_written(struct page *page,
- struct writeback_control *wbc,
- unsigned long nr_written)
+static void update_nr_written(struct page *page, struct writeback_control *wbc,
+ unsigned long nr_written)
{
wbc->nr_to_write -= nr_written;
- if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
- wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
- page->mapping->writeback_index = page->index + nr_written;
}
/*
@@ -3227,7 +3228,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
unsigned long *nr_written)
{
struct extent_io_tree *tree = epd->tree;
- u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = delalloc_start + PAGE_SIZE - 1;
u64 nr_delalloc;
u64 delalloc_to_write = 0;
u64 delalloc_end = 0;
@@ -3264,13 +3265,11 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
goto done;
}
/*
- * delalloc_end is already one less than the total
- * length, so we don't subtract one from
- * PAGE_CACHE_SIZE
+ * delalloc_end is already one less than the total length, so
+ * we don't subtract one from PAGE_SIZE
*/
delalloc_to_write += (delalloc_end - delalloc_start +
- PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SIZE) >> PAGE_SHIFT;
delalloc_start = delalloc_end + 1;
}
if (wbc->nr_to_write < delalloc_to_write) {
@@ -3319,7 +3318,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
{
struct extent_io_tree *tree = epd->tree;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = start + PAGE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
@@ -3370,6 +3369,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
while (cur <= end) {
u64 em_end;
+ unsigned long max_nr;
+
if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
@@ -3425,32 +3426,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
continue;
}
- if (tree->ops && tree->ops->writepage_io_hook) {
- ret = tree->ops->writepage_io_hook(page, cur,
- cur + iosize - 1);
- } else {
- ret = 0;
+ max_nr = (i_size >> PAGE_SHIFT) + 1;
+
+ set_range_writeback(tree, cur, cur + iosize - 1);
+ if (!PageWriteback(page)) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "page %lu not writeback, cur %llu end %llu",
+ page->index, cur, end);
}
- if (ret) {
- SetPageError(page);
- } else {
- unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
- set_range_writeback(tree, cur, cur + iosize - 1);
- if (!PageWriteback(page)) {
- btrfs_err(BTRFS_I(inode)->root->fs_info,
- "page %lu not writeback, cur %llu end %llu",
- page->index, cur, end);
- }
+ ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
+ page, sector, iosize, pg_offset,
+ bdev, &epd->bio, max_nr,
+ end_bio_extent_writepage,
+ 0, 0, 0, false);
+ if (ret)
+ SetPageError(page);
- ret = submit_extent_page(write_flags, tree, wbc, page,
- sector, iosize, pg_offset,
- bdev, &epd->bio, max_nr,
- end_bio_extent_writepage,
- 0, 0, 0, false);
- if (ret)
- SetPageError(page);
- }
cur = cur + iosize;
pg_offset += iosize;
nr++;
@@ -3477,19 +3469,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct inode *inode = page->mapping->host;
struct extent_page_data *epd = data;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset = 0;
loff_t i_size = i_size_read(inode);
- unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
- int write_flags;
+ unsigned long end_index = i_size >> PAGE_SHIFT;
+ int write_flags = 0;
unsigned long nr_written = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC;
- else
- write_flags = WRITE;
trace___extent_writepage(page, inode, wbc);
@@ -3497,10 +3487,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
ClearPageError(page);
- pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ pg_offset = i_size & (PAGE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0;
}
@@ -3510,7 +3500,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0,
- PAGE_CACHE_SIZE - pg_offset);
+ PAGE_SIZE - pg_offset);
kunmap_atomic(userpage);
flush_dcache_page(page);
}
@@ -3733,7 +3723,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
u64 offset = eb->start;
unsigned long i, num_pages;
unsigned long bio_flags = 0;
- int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
+ int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META;
int ret = 0;
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
@@ -3747,9 +3737,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
- PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
- -1, end_bio_extent_buffer_writepage,
+ ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
+ p, offset >> 9, PAGE_SIZE, 0, bdev,
+ &epd->bio, -1,
+ end_bio_extent_buffer_writepage,
0, epd->bio_flags, bio_flags, false);
epd->bio_flags = bio_flags;
if (ret) {
@@ -3760,7 +3751,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
ret = -EIO;
break;
}
- offset += PAGE_CACHE_SIZE;
+ offset += PAGE_SIZE;
update_nr_written(p, wbc, 1);
unlock_page(p);
}
@@ -3804,8 +3795,8 @@ int btree_write_cache_pages(struct address_space *mapping,
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3922,12 +3913,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
- int err = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int range_whole = 0;
int scanned = 0;
int tag;
@@ -3948,8 +3940,10 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3959,6 +3953,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
tag_pages_for_writeback(mapping, index, end);
+ done_index = index;
while (!done && !nr_to_write_done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -3968,6 +3963,7 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ done_index = page->index;
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
@@ -4009,8 +4005,20 @@ retry:
unlock_page(page);
ret = 0;
}
- if (!err && ret < 0)
- err = ret;
+ if (ret < 0) {
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ }
/*
* the filesystem may choose to bump up nr_to_write.
@@ -4022,7 +4030,7 @@ retry:
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done && !err) {
+ if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
@@ -4031,20 +4039,23 @@ retry:
index = 0;
goto retry;
}
+
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
+ mapping->writeback_index = done_index;
+
btrfs_add_delayed_iput(inode);
- return err;
+ return ret;
}
static void flush_epd_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
- int rw = WRITE;
int ret;
- if (epd->sync_io)
- rw = WRITE_SYNC;
+ bio_set_op_attrs(epd->bio, REQ_OP_WRITE,
+ epd->sync_io ? WRITE_SYNC : 0);
- ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
+ ret = submit_one_bio(epd->bio, 0, epd->bio_flags);
BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
@@ -4083,8 +4094,8 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = (end - start + PAGE_SIZE) >>
+ PAGE_SHIFT;
struct extent_page_data epd = {
.bio = NULL,
@@ -4102,18 +4113,18 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
};
while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ page = find_get_page(mapping, start >> PAGE_SHIFT);
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
- start + PAGE_CACHE_SIZE - 1,
+ start + PAGE_SIZE - 1,
NULL, 1);
unlock_page(page);
}
- page_cache_release(page);
- start += PAGE_CACHE_SIZE;
+ put_page(page);
+ start += PAGE_SIZE;
}
flush_epd_write_bio(&epd);
@@ -4162,8 +4173,9 @@ int extent_readpages(struct extent_io_tree *tree,
prefetchw(&page->flags);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping,
- page->index, GFP_NOFS)) {
- page_cache_release(page);
+ page->index,
+ readahead_gfp_mask(mapping))) {
+ put_page(page);
continue;
}
@@ -4171,19 +4183,19 @@ int extent_readpages(struct extent_io_tree *tree,
if (nr < ARRAY_SIZE(pagepool))
continue;
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, READ, &prev_em_start);
+ &bio, 0, &bio_flags, &prev_em_start);
nr = 0;
}
if (nr)
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, READ, &prev_em_start);
+ &bio, 0, &bio_flags, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
BUG_ON(!list_empty(pages));
if (bio)
- return submit_one_bio(READ, bio, 0, bio_flags);
+ return submit_one_bio(bio, 0, bio_flags);
return 0;
}
@@ -4197,7 +4209,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
{
struct extent_state *cached_state = NULL;
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
start += ALIGN(offset, blocksize);
@@ -4223,7 +4235,7 @@ static int try_release_extent_state(struct extent_map_tree *map,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
int ret = 1;
if (test_range_bit(tree, start, end,
@@ -4262,7 +4274,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
{
struct extent_map *em;
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > SZ_16M) {
@@ -4381,8 +4393,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret < 0) {
btrfs_free_path(path);
return ret;
+ } else {
+ WARN_ON(!ret);
+ if (ret == 1)
+ ret = 0;
}
- WARN_ON(!ret);
+
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
found_type = found_key.type;
@@ -4587,14 +4603,14 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
ClearPagePrivate(page);
set_page_private(page, 0);
/* One for the page private */
- page_cache_release(page);
+ put_page(page);
}
if (mapped)
spin_unlock(&page->mapping->private_lock);
- /* One for when we alloced the page */
- page_cache_release(page);
+ /* One for when we allocated the page */
+ put_page(page);
} while (index != 0);
}
@@ -4706,16 +4722,16 @@ err:
}
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u32 nodesize)
{
unsigned long len;
if (!fs_info) {
/*
* Called only from tests that don't always have a fs_info
- * available, but we know that nodesize is 4096
+ * available
*/
- len = 4096;
+ len = nodesize;
} else {
len = fs_info->tree_root->nodesize;
}
@@ -4779,7 +4795,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
/*
@@ -4811,7 +4827,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u32 nodesize)
{
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -4819,17 +4835,17 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
- eb = alloc_dummy_extent_buffer(fs_info, start);
+ eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
if (!eb)
return NULL;
eb->fs_info = fs_info;
again:
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ ret = radix_tree_preload(GFP_NOFS);
if (ret)
goto free_eb;
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_CACHE_SHIFT, eb);
+ start >> PAGE_SHIFT, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -4862,7 +4878,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
unsigned long len = fs_info->tree_root->nodesize;
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
- unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
struct extent_buffer *eb;
struct extent_buffer *exists = NULL;
struct page *p;
@@ -4870,18 +4886,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int uptodate = 1;
int ret;
+ if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
+ btrfs_err(fs_info, "bad tree block start %llu", start);
+ return ERR_PTR(-EINVAL);
+ }
+
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
- if (!p)
+ if (!p) {
+ exists = ERR_PTR(-ENOMEM);
goto free_eb;
+ }
spin_lock(&mapping->private_lock);
if (PagePrivate(p)) {
@@ -4896,7 +4919,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (atomic_inc_not_zero(&exists->refs)) {
spin_unlock(&mapping->private_lock);
unlock_page(p);
- page_cache_release(p);
+ put_page(p);
mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
@@ -4908,7 +4931,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
*/
ClearPagePrivate(p);
WARN_ON(PageDirty(p));
- page_cache_release(p);
+ put_page(p);
}
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
@@ -4925,13 +4948,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (ret)
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_CACHE_SHIFT, eb);
+ start >> PAGE_SHIFT, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -4994,7 +5019,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> PAGE_CACHE_SHIFT);
+ eb->start >> PAGE_SHIFT);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
@@ -5168,8 +5193,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
if (start) {
WARN_ON(start < eb->start);
- start_i = (start >> PAGE_CACHE_SHIFT) -
- (eb->start >> PAGE_CACHE_SHIFT);
+ start_i = (start >> PAGE_SHIFT) -
+ (eb->start >> PAGE_SHIFT);
} else {
start_i = 0;
}
@@ -5200,22 +5225,38 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
page = eb->pages[i];
+
if (!PageUptodate(page)) {
+ if (ret) {
+ atomic_dec(&eb->io_pages);
+ unlock_page(page);
+ continue;
+ }
+
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
mirror_num, &bio_flags,
- READ | REQ_META);
- if (err)
+ REQ_META);
+ if (err) {
ret = err;
+ /*
+ * We use &bio in above __extent_read_full_page,
+ * so we ensure that if it returns error, the
+ * current page fails to add itself to bio and
+ * it's been unlocked.
+ *
+ * We must dec io_pages by ourselves.
+ */
+ atomic_dec(&eb->io_pages);
+ }
} else {
unlock_page(page);
}
}
if (bio) {
- err = submit_one_bio(READ | REQ_META, bio, mirror_num,
- bio_flags);
+ err = submit_one_bio(bio, mirror_num, bio_flags);
if (err)
return err;
}
@@ -5252,18 +5293,18 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
- cur = min(len, (PAGE_CACHE_SIZE - offset));
+ cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
memcpy(dst, kaddr + offset, cur);
@@ -5283,19 +5324,19 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
- cur = min(len, (PAGE_CACHE_SIZE - offset));
+ cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
if (copy_to_user(dst, kaddr + offset, cur)) {
ret = -EFAULT;
@@ -5311,28 +5352,33 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
return ret;
}
+/*
+ * return 0 if the item is found within a page.
+ * return 1 if the item spans two pages.
+ * return -EINVAL otherwise.
+ */
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
unsigned long min_len, char **map,
unsigned long *map_start,
unsigned long *map_len)
{
- size_t offset = start & (PAGE_CACHE_SIZE - 1);
+ size_t offset = start & (PAGE_SIZE - 1);
char *kaddr;
struct page *p;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
unsigned long end_i = (start_offset + start + min_len - 1) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
if (i != end_i)
- return -EINVAL;
+ return 1;
if (i == 0) {
offset = start_offset;
*map_start = 0;
} else {
offset = 0;
- *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+ *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
}
if (start + min_len > eb->len) {
@@ -5345,7 +5391,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
p = eb->pages[i];
kaddr = page_address(p);
*map = kaddr + offset;
- *map_len = PAGE_CACHE_SIZE - offset;
+ *map_len = PAGE_SIZE - offset;
return 0;
}
@@ -5358,19 +5404,19 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
- cur = min(len, (PAGE_CACHE_SIZE - offset));
+ cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
ret = memcmp(ptr, kaddr + offset, cur);
@@ -5393,19 +5439,19 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
- cur = min(len, PAGE_CACHE_SIZE - offset);
+ cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
memcpy(kaddr + offset, src, cur);
@@ -5423,19 +5469,19 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
- cur = min(len, PAGE_CACHE_SIZE - offset);
+ cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
memset(kaddr + offset, c, cur);
@@ -5454,19 +5500,19 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
WARN_ON(src->len != dst_len);
offset = (start_offset + dst_offset) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
while (len > 0) {
page = dst->pages[i];
WARN_ON(!PageUptodate(page));
- cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+ cur = min(len, (unsigned long)(PAGE_SIZE - offset));
kaddr = page_address(page);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
@@ -5508,7 +5554,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
unsigned long *page_index,
size_t *page_offset)
{
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -5519,8 +5565,8 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
*/
offset = start_offset + start + byte_offset;
- *page_index = offset >> PAGE_CACHE_SHIFT;
- *page_offset = offset & (PAGE_CACHE_SIZE - 1);
+ *page_index = offset >> PAGE_SHIFT;
+ *page_offset = offset & (PAGE_SIZE - 1);
}
/**
@@ -5572,7 +5618,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
len -= bits_to_set;
bits_to_set = BITS_PER_BYTE;
mask_to_set = ~0U;
- if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+ if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
WARN_ON(!PageUptodate(page));
@@ -5614,7 +5660,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
len -= bits_to_clear;
bits_to_clear = BITS_PER_BYTE;
mask_to_clear = ~0U;
- if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+ if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
WARN_ON(!PageUptodate(page));
@@ -5661,7 +5707,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
- size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
unsigned long dst_i;
unsigned long src_i;
@@ -5680,17 +5726,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
while (len > 0) {
dst_off_in_page = (start_offset + dst_offset) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
src_off_in_page = (start_offset + src_offset) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
- dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
- src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+ dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
+ src_i = (start_offset + src_offset) >> PAGE_SHIFT;
- cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+ cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
cur = min_t(unsigned long, cur,
- (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+ (unsigned long)(PAGE_SIZE - dst_off_in_page));
copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page, src_off_in_page, cur);
@@ -5709,7 +5755,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
unsigned long dst_i;
unsigned long src_i;
@@ -5728,13 +5774,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
return;
}
while (len > 0) {
- dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
- src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+ dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
+ src_i = (start_offset + src_end) >> PAGE_SHIFT;
dst_off_in_page = (start_offset + dst_end) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
src_off_in_page = (start_offset + src_end) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
@@ -5753,7 +5799,7 @@ int try_release_extent_buffer(struct page *page)
struct extent_buffer *eb;
/*
- * We need to make sure noboody is attaching this page to an eb right
+ * We need to make sure nobody is attaching this page to an eb right
* now.
*/
spin_lock(&page->mapping->private_lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5dbf92e68fbd1..bc2729a7612db 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -63,17 +63,16 @@ struct btrfs_root;
struct btrfs_io_bio;
struct io_failure_record;
-typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags, u64 bio_offset);
+typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset);
struct extent_io_ops {
int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written);
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
- int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
extent_submit_bio_hook_t *submit_bio_hook;
- int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
+ int (*merge_bio_hook)(struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
@@ -120,7 +119,7 @@ struct extent_state {
};
#define INLINE_EXTENT_BUFFER_PAGES 16
-#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
@@ -221,8 +220,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int filled,
struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset);
+ unsigned bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
@@ -241,27 +239,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits, gfp_t mask)
+ u64 end, unsigned bits)
{
int wake = 0;
if (bits & EXTENT_LOCKED)
wake = 1;
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL,
+ GFP_NOFS);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask,
- struct extent_changeset *changeset);
+ unsigned bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits, gfp_t mask)
+ u64 end, unsigned bits)
{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -279,37 +277,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
+ u64 end)
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state, gfp_t mask);
+ struct extent_state **cached_state);
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
+ u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE,
- NULL, cached_state, mask);
+ NULL, cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
+ u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, mask);
+ NULL, cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
+ u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
+ GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -349,7 +348,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start, u32 nodesize);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -365,8 +364,8 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
static inline unsigned long num_extent_pages(u64 start, u64 len)
{
- return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
- (start >> PAGE_CACHE_SHIFT);
+ return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (start >> PAGE_SHIFT);
}
static inline void extent_buffer_get(struct extent_buffer *eb)
@@ -469,5 +468,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 *end, u64 max_bytes);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start, u32 nodesize);
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 318b048eb2549..26f9ac719d20b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -13,7 +13,7 @@ int __init extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!extent_map_cache)
return -ENOMEM;
return 0;
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
/**
* free_extent_map - drop reference count of an extent_map
- * @em: extent map being releasead
+ * @em: extent map being released
*
* Drops the reference out on @em by one and free the structure
* if the reference count hits zero.
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b5baf5bdc8e18..d0d571c47d33b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -32,7 +32,7 @@
size) - 1))
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
- PAGE_CACHE_SIZE))
+ PAGE_SIZE))
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
@@ -203,7 +203,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
csum = (u8 *)dst;
}
- if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
+ if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
path->reada = READA_FORWARD;
WARN_ON(bio->bi_vcnt <= 0);
@@ -248,9 +248,9 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
offset + root->sectorsize - 1,
- EXTENT_NODATASUM, GFP_NOFS);
+ EXTENT_NODATASUM);
} else {
- btrfs_info(BTRFS_I(inode)->root->fs_info,
+ btrfs_info_rl(BTRFS_I(inode)->root->fs_info,
"no csum found for inode %llu start %llu",
btrfs_ino(inode), offset);
}
@@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 15a09cb156cec..9404121fd5f7b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
static inline int __need_auto_defrag(struct btrfs_root *root)
{
- if (!btrfs_test_opt(root, AUTO_DEFRAG))
+ if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG))
return 0;
if (btrfs_fs_closing(root->fs_info))
@@ -414,11 +414,11 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
size_t copied = 0;
size_t total_copied = 0;
int pg = 0;
- int offset = pos & (PAGE_CACHE_SIZE - 1);
+ int offset = pos & (PAGE_SIZE - 1);
while (write_bytes > 0) {
size_t count = min_t(size_t,
- PAGE_CACHE_SIZE - offset, write_bytes);
+ PAGE_SIZE - offset, write_bytes);
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
@@ -448,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
if (unlikely(copied == 0))
break;
- if (copied < PAGE_CACHE_SIZE - offset) {
+ if (copied < PAGE_SIZE - offset) {
offset += copied;
} else {
pg++;
@@ -473,7 +473,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
}
@@ -950,7 +950,7 @@ delete_extent_item:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
@@ -974,7 +974,7 @@ delete_extent_item:
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
leaf = path->nodes[0];
@@ -1190,7 +1190,7 @@ again:
goto again;
}
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1278,7 +1278,7 @@ again:
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -1297,7 +1297,7 @@ static int prepare_uptodate_page(struct inode *inode,
{
int ret = 0;
- if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+ if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
if (ret)
@@ -1323,7 +1323,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t write_bytes, bool force_uptodate)
{
int i;
- unsigned long index = pos >> PAGE_CACHE_SHIFT;
+ unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili;
@@ -1345,7 +1345,7 @@ again:
err = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (err) {
- page_cache_release(pages[i]);
+ put_page(pages[i]);
if (err == -EAGAIN) {
err = 0;
goto again;
@@ -1360,7 +1360,7 @@ again:
fail:
while (faili >= 0) {
unlock_page(pages[faili]);
- page_cache_release(pages[faili]);
+ put_page(pages[faili]);
faili--;
}
return err;
@@ -1408,7 +1408,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
cached_state, GFP_NOFS);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -1497,8 +1497,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
bool force_page_uptodate = false;
bool need_unlock;
- nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
- PAGE_CACHE_SIZE / (sizeof(struct page *)));
+ nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
+ PAGE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
@@ -1506,13 +1506,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
- size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t offset = pos & (PAGE_SIZE - 1);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
- nrptrs * (size_t)PAGE_CACHE_SIZE -
+ nrptrs * (size_t)PAGE_SIZE -
offset);
size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1534,30 +1534,30 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
reserve_bytes = round_up(write_bytes + sector_offset,
root->sectorsize);
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) &&
- check_can_nocow(inode, pos, &write_bytes) > 0) {
- /*
- * For nodata cow case, no need to reserve
- * data space.
- */
- only_release_metadata = true;
- /*
- * our prealloc extent may be smaller than
- * write_bytes, so scale down.
- */
- num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_CACHE_SIZE);
- reserve_bytes = round_up(write_bytes + sector_offset,
- root->sectorsize);
- goto reserve_metadata;
- }
-
ret = btrfs_check_data_free_space(inode, pos, write_bytes);
- if (ret < 0)
- break;
+ if (ret < 0) {
+ if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) &&
+ check_can_nocow(inode, pos, &write_bytes) > 0) {
+ /*
+ * For nodata cow case, no need to reserve
+ * data space.
+ */
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_SIZE);
+ reserve_bytes = round_up(write_bytes +
+ sector_offset,
+ root->sectorsize);
+ } else {
+ break;
+ }
+ }
-reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
@@ -1596,6 +1596,13 @@ again:
copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
+ num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ reserve_bytes);
+ dirty_sectors = round_up(copied + sector_offset,
+ root->sectorsize);
+ dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ dirty_sectors);
+
/*
* if we have trouble faulting in the pages, fall
* back to one page at a time
@@ -1605,30 +1612,28 @@ again:
if (copied == 0) {
force_page_uptodate = true;
+ dirty_sectors = 0;
dirty_pages = 0;
} else {
force_page_uptodate = false;
dirty_pages = DIV_ROUND_UP(copied + offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
}
/*
* If we had a short copy we need to release the excess delaloc
* bytes we reserved. We need to increment outstanding_extents
- * because btrfs_delalloc_release_space will decrement it, but
+ * because btrfs_delalloc_release_space and
+ * btrfs_delalloc_release_metadata will decrement it, but
* we still have an outstanding extent for the chunk we actually
* managed to copy.
*/
- num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
- reserve_bytes);
- dirty_sectors = round_up(copied + sector_offset,
- root->sectorsize);
- dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
- dirty_sectors);
-
if (num_sectors > dirty_sectors) {
- release_bytes = (write_bytes - copied)
- & ~((u64)root->sectorsize - 1);
+
+ /* release everything except the sectors we dirtied */
+ release_bytes -= dirty_sectors <<
+ root->fs_info->sb->s_blocksize_bits;
+
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
@@ -1641,7 +1646,7 @@ again:
u64 __pos;
__pos = round_down(pos, root->sectorsize) +
- (dirty_pages << PAGE_CACHE_SHIFT);
+ (dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(inode, __pos,
release_bytes);
}
@@ -1682,7 +1687,7 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
+ if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1)
btrfs_btree_balance_dirty(root);
pos += copied;
@@ -1696,25 +1701,26 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, pos, release_bytes);
+ btrfs_delalloc_release_space(inode,
+ round_down(pos, root->sectorsize),
+ release_bytes);
}
}
return num_written ? num_written : ret;
}
-static ssize_t __btrfs_direct_write(struct kiocb *iocb,
- struct iov_iter *from,
- loff_t pos)
+static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ loff_t pos = iocb->ki_pos;
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from, pos);
+ written = generic_file_direct_write(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -1738,8 +1744,8 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
goto out;
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
- invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
out:
return written ? written : err;
}
@@ -1832,7 +1838,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
- num_written = __btrfs_direct_write(iocb, from, pos);
+ num_written = __btrfs_direct_write(iocb, from);
} else {
num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
@@ -1852,11 +1858,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_sub_trans = root->log_transid;
spin_unlock(&BTRFS_I(inode)->lock);
- if (num_written > 0) {
- err = generic_write_sync(file, pos, num_written);
- if (err < 0)
- num_written = err;
- }
+ if (num_written > 0)
+ num_written = generic_write_sync(iocb, num_written);
if (sync)
atomic_dec(&BTRFS_I(inode)->sync_writers);
@@ -1905,7 +1908,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct dentry *dentry = file->f_path.dentry;
+ struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -2024,7 +2027,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
BTRFS_I(inode)->last_trans
<= root->fs_info->last_trans_committed)) {
/*
- * We'v had everything committed since the last time we were
+ * We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
* reason, it's no longer relevant.
*/
@@ -2372,7 +2375,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* Check the aligned pages after the first unaligned page,
* if offset != orig_start, which means the first unaligned page
- * including serveral following pages are already in holes,
+ * including several following pages are already in holes,
* the extra check can be skipped */
if (offset == orig_start) {
/* after truncate page, check hole again */
@@ -2474,7 +2477,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
- min_size);
+ min_size, 0);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2517,7 +2520,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
- rsv, min_size);
+ rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
@@ -2682,9 +2685,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret;
inode_lock(inode);
- ret = inode_newsize_ok(inode, alloc_end);
- if (ret)
- goto out;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret)
+ goto out;
+ }
/*
* TODO: Move these two operations after we have checked
@@ -2953,7 +2959,7 @@ const struct file_operations btrfs_file_operations = {
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = btrfs_ioctl,
+ .compat_ioctl = btrfs_compat_ioctl,
#endif
.copy_file_range = btrfs_copy_file_range,
.clone_file_range = btrfs_clone_file_range,
@@ -2969,7 +2975,7 @@ int btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
sizeof(struct inode_defrag), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_inode_defrag_cachep)
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 8f835bfa1bdd2..d571bd2b697bf 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -29,7 +29,7 @@
#include "inode-map.h"
#include "volumes.h"
-#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_32K
struct btrfs_trim_range {
@@ -280,7 +280,7 @@ fail:
if (locked)
mutex_unlock(&trans->transaction->cache_write_mutex);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -295,7 +295,7 @@ static int readahead_cache(struct inode *inode)
return -ENOMEM;
file_ra_state_init(ra, inode->i_mapping);
- last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
@@ -310,14 +310,14 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int num_pages;
int check_crcs = 0;
- num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+ num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
check_crcs = 1;
/* Make sure we can fit our crcs into the first page */
if (write && check_crcs &&
- (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ (num_pages * sizeof(u32)) >= PAGE_SIZE)
return -ENOSPC;
memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
@@ -354,9 +354,9 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
io_ctl->page = io_ctl->pages[io_ctl->index++];
io_ctl->cur = page_address(io_ctl->page);
io_ctl->orig = io_ctl->cur;
- io_ctl->size = PAGE_CACHE_SIZE;
+ io_ctl->size = PAGE_SIZE;
if (clear)
- memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+ memset(io_ctl->cur, 0, PAGE_SIZE);
}
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
@@ -369,7 +369,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
if (io_ctl->pages[i]) {
ClearPageChecked(io_ctl->pages[i]);
unlock_page(io_ctl->pages[i]);
- page_cache_release(io_ctl->pages[i]);
+ put_page(io_ctl->pages[i]);
}
}
}
@@ -475,7 +475,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
offset = sizeof(u32) * io_ctl->num_pages;
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
@@ -503,7 +503,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
io_ctl_map_page(io_ctl, 0);
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->root->fs_info,
@@ -561,7 +561,7 @@ static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
io_ctl_map_page(io_ctl, 0);
}
- memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+ memcpy(io_ctl->cur, bitmap, PAGE_SIZE);
io_ctl_set_crc(io_ctl, io_ctl->index - 1);
if (io_ctl->index < io_ctl->num_pages)
io_ctl_map_page(io_ctl, 0);
@@ -621,7 +621,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
if (ret)
return ret;
- memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+ memcpy(entry->bitmap, io_ctl->cur, PAGE_SIZE);
io_ctl_unmap_page(io_ctl);
return 0;
@@ -775,7 +775,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
} else {
ASSERT(num_bitmaps);
num_bitmaps--;
- e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
if (!e->bitmap) {
kmem_cache_free(
btrfs_free_space_cachep, e);
@@ -1415,11 +1415,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 offset)
{
u64 bitmap_start;
- u32 bytes_per_bitmap;
+ u64 bytes_per_bitmap;
bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
bitmap_start = offset - ctl->start;
- bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
bitmap_start += ctl->start;
@@ -1638,10 +1638,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
- u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
- u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
- max_bitmaps = max_t(u32, max_bitmaps, 1);
+ max_bitmaps = max_t(u64, max_bitmaps, 1);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
@@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
* sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
* we add more bitmaps.
*/
- bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+ bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
if (bitmap_bytes >= max_bytes) {
ctl->extents_thresh = 0;
@@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
/*
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
- * to reserve them to larger extents, however if we have plent
+ * to reserve them to larger extents, however if we have plenty
* of cache left then go ahead an dadd them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
@@ -2111,7 +2111,7 @@ new_bitmap:
}
/* allocate the bitmap */
- info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
@@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
* For metadata, allow allocates with smaller extents. For
* data, keep it dense.
*/
- if (btrfs_test_opt(root, SSD_SPREAD)) {
+ if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) {
cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
cont1_bytes = bytes;
@@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
int ret = 0;
u64 root_gen = btrfs_root_generation(&root->root_item);
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
/*
@@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct btrfs_io_ctl io_ctl;
bool release_metadata = true;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
memset(&io_ctl, 0, sizeof(io_ctl));
@@ -3580,7 +3580,7 @@ again:
}
if (!map) {
- map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ map = kzalloc(PAGE_SIZE, GFP_NOFS);
if (!map) {
kmem_cache_free(btrfs_free_space_cachep, info);
return -ENOMEM;
@@ -3662,7 +3662,7 @@ have_info:
if (tmp->offset + tmp->bytes < offset)
break;
if (offset + bytes < tmp->offset) {
- n = rb_prev(&info->offset_index);
+ n = rb_prev(&tmp->offset_index);
continue;
}
info = tmp;
@@ -3676,7 +3676,7 @@ have_info:
if (offset + bytes < tmp->offset)
break;
if (tmp->offset + tmp->bytes < offset) {
- n = rb_next(&info->offset_index);
+ n = rb_next(&tmp->offset_index);
continue;
}
info = tmp;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 33178c490ace0..3af651c2bbc7e 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space(
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
-/* Support functions for runnint our sanity tests */
+/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
u64 offset, u64 bytes, bool bitmap);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 53dbeaf6ce941..87e7e3d3e6760 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
out:
kvfree(bitmap);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
out:
kvfree(bitmap);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
if (ret)
- btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
if (ret)
- btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
abort:
fs_info->creating_free_space_tree = 0;
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, tree_root);
return ret;
}
@@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
return 0;
abort:
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, tree_root);
return ret;
}
@@ -1333,7 +1333,7 @@ out:
btrfs_free_path(path);
mutex_unlock(&block_group->free_space_lock);
if (ret)
- btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index aae520b2aee53..a97fdc156a035 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -24,6 +24,11 @@ int __init btrfs_hash_init(void)
return PTR_ERR_OR_ZERO(tfm);
}
+const char* btrfs_crc32c_impl(void)
+{
+ return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
+}
+
void btrfs_hash_exit(void)
{
crypto_free_shash(tfm);
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 118a2316e5d39..c3a2ec554361f 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -22,6 +22,7 @@
int __init btrfs_hash_init(void);
void btrfs_hash_exit(void);
+const char* btrfs_crc32c_impl(void);
u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index be4d22a5022fa..b8acc07ac6c2b 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
name, name_len, &extref)) {
- btrfs_std_error(root->fs_info, -ENOENT, NULL);
+ btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 1f0ec19b23f61..aa6fabaee72ed 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,7 +38,7 @@ static int caching_kthread(void *data)
int slot;
int ret;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root)
int ret;
u64 objectid;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return;
spin_lock(&root->ino_cache_lock);
@@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root)
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return btrfs_find_free_objectid(root, objectid);
again:
@@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return;
again:
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
@@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
struct rb_node *n;
u64 count;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return;
while (1) {
@@ -283,7 +283,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
}
#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
-#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define INODES_PER_BITMAP (PAGE_SIZE * 8)
/*
* The goal is to keep the memory used by the free_ino tree won't
@@ -317,7 +317,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
}
ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
- PAGE_CACHE_SIZE / sizeof(*info);
+ PAGE_SIZE / sizeof(*info);
}
/*
@@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
if (btrfs_root_refs(&root->root_item) == 0)
return 0;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -458,7 +458,7 @@ again:
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_put;
}
@@ -466,7 +466,7 @@ again:
ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
if (ret) {
if (ret != -ENOSPC)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_put;
}
}
@@ -481,12 +481,12 @@ again:
spin_lock(&ctl->tree_lock);
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
- prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
- prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+ prealloc = ALIGN(prealloc, PAGE_SIZE);
+ prealloc += ctl->total_bitmaps * PAGE_SIZE;
spin_unlock(&ctl->tree_lock);
/* Just to make sure we have enough space */
- prealloc += 8 * PAGE_CACHE_SIZE;
+ prealloc += 8 * PAGE_SIZE;
ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 41a5688ffdfe8..b0f421f332ae9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -60,6 +60,7 @@
#include "hash.h"
#include "props.h"
#include "qgroup.h"
+#include "dedupe.h"
struct btrfs_iget_args {
struct btrfs_key *location;
@@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock);
+ u64 start, u64 end, u64 delalloc_end,
+ int *page_started, unsigned long *nr_written,
+ int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
u64 len, u64 orig_start,
u64 block_start, u64 block_len,
@@ -194,7 +196,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
while (compressed_size > 0) {
cpage = compressed_pages[i];
cur_size = min_t(unsigned long, compressed_size,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
@@ -208,13 +210,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
compress_type);
} else {
page = find_get_page(inode->i_mapping,
- start >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = start & (PAGE_CACHE_SIZE - 1);
+ offset = start & (PAGE_SIZE - 1);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
kunmap_atomic(kaddr);
- page_cache_release(page);
+ put_page(page);
}
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
start, aligned_end, NULL,
1, 1, extent_item_size, &extent_inserted);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
inline_len, compressed_size,
compress_type, compressed_pages);
if (ret && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
ret = 1;
@@ -322,7 +324,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
+ btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
/* force compress */
- if (btrfs_test_opt(root, FORCE_COMPRESS))
+ if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
return 1;
/* bad compression ratios */
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
- if (btrfs_test_opt(root, COMPRESS) ||
+ if (btrfs_test_opt(root->fs_info, COMPRESS) ||
BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
BTRFS_I(inode)->force_compress)
return 1;
@@ -435,8 +437,8 @@ static noinline void compress_file_range(struct inode *inode,
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
- nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
/*
* we don't want to send crud past the end of i_size through
@@ -455,7 +457,7 @@ again:
/*
* skip compression for a small file range(<=blocksize) that
- * isn't an inline extent, since it dosen't save disk space at all.
+ * isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
@@ -514,7 +516,7 @@ again:
if (!ret) {
unsigned long offset = total_compressed &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
struct page *page = pages[nr_pages_ret - 1];
char *kaddr;
@@ -524,7 +526,7 @@ again:
if (offset) {
kaddr = kmap_atomic(page);
memset(kaddr + offset, 0,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
kunmap_atomic(kaddr);
}
will_compress = 1;
@@ -580,21 +582,39 @@ cont:
* one last check to make sure the compression is really a
* win, compare the page count read with the blocks on disk
*/
- total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
+ total_in = ALIGN(total_in, PAGE_SIZE);
if (total_compressed >= total_in) {
will_compress = 0;
} else {
num_bytes = total_in;
+ *num_added += 1;
+
+ /*
+ * The async work queues will take care of doing actual
+ * allocation on disk for these compressed pages, and
+ * will submit them to the elevator.
+ */
+ add_async_extent(async_cow, start, num_bytes,
+ total_compressed, pages, nr_pages_ret,
+ compress_type);
+
+ if (start + num_bytes < end) {
+ start += num_bytes;
+ pages = NULL;
+ cond_resched();
+ goto again;
+ }
+ return;
}
}
- if (!will_compress && pages) {
+ if (pages) {
/*
* the compression code ran but failed to make things smaller,
* free any pages it allocated and our page pointer array
*/
for (i = 0; i < nr_pages_ret; i++) {
WARN_ON(pages[i]->mapping);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
kfree(pages);
pages = NULL;
@@ -602,55 +622,35 @@ cont:
nr_pages_ret = 0;
/* flag the file so we don't compress in the future */
- if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+ if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
!(BTRFS_I(inode)->force_compress)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
}
}
- if (will_compress) {
- *num_added += 1;
-
- /* the async work queues will take care of doing actual
- * allocation on disk for these compressed pages,
- * and will submit them to the elevator.
- */
- add_async_extent(async_cow, start, num_bytes,
- total_compressed, pages, nr_pages_ret,
- compress_type);
-
- if (start + num_bytes < end) {
- start += num_bytes;
- pages = NULL;
- cond_resched();
- goto again;
- }
- } else {
cleanup_and_bail_uncompressed:
- /*
- * No compression, but we still need to write the pages in
- * the file we've been given so far. redirty the locked
- * page if it corresponds to our extent and set things up
- * for the async work queue to run cow_file_range to do
- * the normal delalloc dance
- */
- if (page_offset(locked_page) >= start &&
- page_offset(locked_page) <= end) {
- __set_page_dirty_nobuffers(locked_page);
- /* unlocked later on in the async handlers */
- }
- if (redirty)
- extent_range_redirty_for_io(inode, start, end);
- add_async_extent(async_cow, start, end - start + 1,
- 0, NULL, 0, BTRFS_COMPRESS_NONE);
- *num_added += 1;
- }
+ /*
+ * No compression, but we still need to write the pages in the file
+ * we've been given so far. redirty the locked page if it corresponds
+ * to our extent and set things up for the async work queue to run
+ * cow_file_range to do the normal delalloc dance.
+ */
+ if (page_offset(locked_page) >= start &&
+ page_offset(locked_page) <= end)
+ __set_page_dirty_nobuffers(locked_page);
+ /* unlocked later on in the async handlers */
+
+ if (redirty)
+ extent_range_redirty_for_io(inode, start, end);
+ add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
+ BTRFS_COMPRESS_NONE);
+ *num_added += 1;
return;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
WARN_ON(pages[i]->mapping);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
kfree(pages);
}
@@ -664,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
- page_cache_release(async_extent->pages[i]);
+ put_page(async_extent->pages[i]);
}
kfree(async_extent->pages);
async_extent->nr_pages = 0;
@@ -712,7 +712,10 @@ retry:
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ async_extent->start +
+ async_extent->ram_size - 1,
+ &page_started, &nr_written, 0,
+ NULL);
/* JDM XXX */
@@ -824,6 +827,7 @@ retry:
async_extent->ram_size - 1, 0);
goto out_free_reserve;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
/*
* clear dirty, set writeback and unlock the pages.
@@ -861,6 +865,7 @@ retry:
}
return;
out_free_reserve:
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -923,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
*/
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- int unlock)
+ u64 start, u64 end, u64 delalloc_end,
+ int *page_started, unsigned long *nr_written,
+ int unlock, struct btrfs_dedupe_hash *hash)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 alloc_hint = 0;
@@ -966,7 +971,7 @@ static noinline int cow_file_range(struct inode *inode,
PAGE_END_WRITEBACK);
*nr_written = *nr_written +
- (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+ (end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
goto out;
} else if (ret < 0) {
@@ -1038,6 +1043,8 @@ static noinline int cow_file_range(struct inode *inode,
goto out_drop_extent_cache;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
if (disk_num_bytes < cur_alloc_size)
break;
@@ -1066,6 +1073,7 @@ out:
out_drop_extent_cache:
btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1106,8 +1114,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
async_cow = container_of(work, struct async_cow, work);
root = async_cow->root;
- nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
+ PAGE_SHIFT;
/*
* atomic_sub_return implies a barrier for waitqueue_active
@@ -1151,7 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->start = start;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
- !btrfs_test_opt(root, FORCE_COMPRESS))
+ !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
cur_end = end;
else
cur_end = min(end, start + SZ_512K - 1);
@@ -1164,8 +1172,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow_start, async_cow_submit,
async_cow_free);
- nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ nr_pages = (cur_end - start + PAGE_SIZE) >>
+ PAGE_SHIFT;
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
btrfs_queue_work(root->fs_info->delalloc_workers,
@@ -1377,6 +1385,9 @@ next_slot:
*/
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out_check;
+ if (!btrfs_inc_nocow_writers(root->fs_info,
+ disk_bytenr))
+ goto out_check;
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
@@ -1391,6 +1402,9 @@ out_check:
path->slots[0]++;
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info,
+ disk_bytenr);
goto next_slot;
}
if (!nocow) {
@@ -1407,10 +1421,14 @@ out_check:
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page,
cow_start, found_key.offset - 1,
- page_started, nr_written, 1);
+ end, page_started, nr_written, 1,
+ NULL);
if (ret) {
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info,
+ disk_bytenr);
goto error;
}
cow_start = (u64)-1;
@@ -1453,6 +1471,8 @@ out_check:
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes, type);
+ if (nocow)
+ btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
@@ -1485,8 +1505,8 @@ out_check:
}
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page, cow_start, end,
- page_started, nr_written, 1);
+ ret = cow_file_range(inode, locked_page, cow_start, end, end,
+ page_started, nr_written, 1, NULL);
if (ret)
goto error;
}
@@ -1545,8 +1565,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_need_compress(inode)) {
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ ret = cow_file_range(inode, locked_page, start, end, end,
+ page_started, nr_written, 1, NULL);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags);
@@ -1724,7 +1744,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
}
/* For sanity tests */
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return;
__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
@@ -1783,7 +1803,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
btrfs_delalloc_release_metadata(inode, len);
/* For sanity tests. */
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return;
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
@@ -1806,8 +1826,12 @@ static void btrfs_clear_bit_hook(struct inode *inode,
/*
* extent_io.c merge_bio_hook, this must check the chunk tree to make sure
* we don't create bios that span stripes or chunks
+ *
+ * return 1 if page cannot be merged to bio
+ * return 0 if page can be merged to bio
+ * return error otherwise
*/
-int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags)
{
@@ -1822,10 +1846,10 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_map_block(root->fs_info, rw, logical,
+ ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
&map_length, NULL, 0);
- /* Will always return 0 with map_multi == NULL */
- BUG_ON(ret < 0);
+ if (ret < 0)
+ return ret;
if (map_length < length + size)
return 1;
return 0;
@@ -1839,9 +1863,8 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static int __btrfs_submit_bio_start(struct inode *inode, int rw,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags,
+static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1860,14 +1883,14 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
- ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ ret = btrfs_map_bio(root, bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@ -1879,7 +1902,7 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read
*/
-static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
@@ -1894,7 +1917,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
if (btrfs_is_free_space_inode(inode))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
- if (!(rw & REQ_WRITE)) {
+ if (bio_op(bio) != REQ_OP_WRITE) {
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
if (ret)
goto out;
@@ -1916,7 +1939,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num,
+ inode, bio, mirror_num,
bio_flags, bio_offset,
__btrfs_submit_bio_start,
__btrfs_submit_bio_done);
@@ -1928,7 +1951,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
}
mapit:
- ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+ ret = btrfs_map_bio(root, bio, mirror_num, 0);
out:
if (ret < 0) {
@@ -1960,9 +1983,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state)
{
- WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+ WARN_ON((end & (PAGE_SIZE - 1)) == 0);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- cached_state, GFP_NOFS);
+ cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -1993,7 +2016,7 @@ again:
inode = page->mapping->host;
page_start = page_offset(page);
- page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+ page_end = page_offset(page) + PAGE_SIZE - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -2003,7 +2026,7 @@ again:
goto out;
ordered = btrfs_lookup_ordered_range(inode, page_start,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (ordered) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
page_end, &cached_state, GFP_NOFS);
@@ -2014,7 +2037,7 @@ again:
}
ret = btrfs_delalloc_reserve_space(inode, page_start,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -2030,7 +2053,7 @@ out:
&cached_state, GFP_NOFS);
out_page:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
kfree(fixup);
}
@@ -2063,7 +2086,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
return -EAGAIN;
SetPageChecked(page);
- page_cache_get(page);
+ get_page(page);
btrfs_init_work(&fixup->work, btrfs_fixup_helper,
btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
@@ -2579,7 +2602,7 @@ again:
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -2606,7 +2629,7 @@ again:
backref->root_id, backref->inum,
new->file_pos); /* start - extent_offset */
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -2875,7 +2898,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2935,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset, ordered_extent->len,
trans->transid);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_unlock;
}
@@ -2945,7 +2968,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_unlock;
}
ret = 0;
@@ -3103,8 +3126,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
- GFP_NOFS);
+ clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
return 0;
}
@@ -3190,7 +3212,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
else
clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
&root->state);
@@ -3256,7 +3278,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
/* grab metadata reservation from transaction handle */
if (reserve) {
ret = btrfs_orphan_reserve_metadata(trans, inode);
- BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
+ ASSERT(!ret);
+ if (ret) {
+ atomic_dec(&root->orphan_inodes);
+ clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
+ if (insert)
+ clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
+ return ret;
+ }
}
/* insert an orphan item to track this unlinked/truncated file */
@@ -3272,7 +3303,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
if (ret != -EEXIST) {
clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -3284,7 +3315,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -3706,7 +3737,7 @@ cache_index:
* and doesn't have an inode ref with the name "bar" anymore.
*
* Setting last_unlink_trans to last_trans is a pessimistic approach,
- * but it guarantees correctness at the expense of ocassional full
+ * but it guarantees correctness at the expense of occasional full
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
@@ -3983,20 +4014,20 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
btrfs_info(root->fs_info,
"failed to delete reference to %.*s, inode %llu parent %llu",
name_len, name, ino, dir_ino);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto err;
}
skip_backref:
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto err;
}
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir_ino);
if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto err;
}
@@ -4005,7 +4036,7 @@ skip_backref:
if (ret == -ENOENT)
ret = 0;
else if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err:
btrfs_free_path(path);
if (ret)
@@ -4119,7 +4150,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
@@ -4129,7 +4160,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
dir_ino, &index, name, name_len);
if (ret < 0) {
if (ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
di = btrfs_search_dir_index_item(root, path, dir_ino,
@@ -4139,7 +4170,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = -ENOENT;
else
ret = PTR_ERR(di);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4152,7 +4183,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4161,7 +4192,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
return ret;
@@ -4247,7 +4278,7 @@ static int truncate_inline_extent(struct inode *inode,
if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
loff_t offset = new_size;
- loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+ loff_t page_end = ALIGN(offset, PAGE_SIZE);
/*
* Zero out the remaining of the last page of our inline extent,
@@ -4482,7 +4513,6 @@ search_again:
pending_del_nr);
if (err) {
btrfs_abort_transaction(trans,
- root,
err);
goto error;
}
@@ -4494,8 +4524,7 @@ search_again:
item_end,
new_size);
if (err) {
- btrfs_abort_transaction(trans,
- root, err);
+ btrfs_abort_transaction(trans, err);
goto error;
}
} else if (test_bit(BTRFS_ROOT_REF_COWS,
@@ -4534,6 +4563,7 @@ delete:
BUG_ON(ret);
if (btrfs_should_throttle_delayed_refs(trans, root))
btrfs_async_run_delayed_refs(root,
+ trans->transid,
trans->delayed_ref_updates * 2, 0);
if (be_nice) {
if (truncate_space_check(trans, root,
@@ -4558,8 +4588,7 @@ delete:
pending_del_slot,
pending_del_nr);
if (ret) {
- btrfs_abort_transaction(trans,
- root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error;
}
pending_del_nr = 0;
@@ -4592,7 +4621,7 @@ out:
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
error:
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
@@ -4633,7 +4662,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
struct extent_state *cached_state = NULL;
char *kaddr;
u32 blocksize = root->sectorsize;
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
@@ -4668,7 +4697,7 @@ again:
lock_page(page);
if (page->mapping != mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
if (!PageUptodate(page)) {
@@ -4686,7 +4715,7 @@ again:
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state, GFP_NOFS);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -4728,7 +4757,7 @@ out_unlock:
btrfs_delalloc_release_space(inode, block_start,
blocksize);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
return ret;
}
@@ -4761,7 +4790,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
return ret;
}
@@ -4769,7 +4798,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
0, 0, len, 0, len, 0, 0, 0);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
else
btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans, root);
@@ -4962,7 +4991,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* be instantly completed which will give us extents that need
* to be truncated. If we fail to get an orphan inode down we
* could have left over extents that were never meant to live,
- * so we need to garuntee from this point on that everything
+ * so we need to guarantee from this point on that everything
* will be consistent.
*/
ret = btrfs_orphan_add(trans, inode);
@@ -4996,7 +5025,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
err = btrfs_orphan_del(trans, inode);
if (err)
- btrfs_abort_transaction(trans, root, err);
+ btrfs_abort_transaction(trans, err);
btrfs_end_transaction(trans, root);
}
}
@@ -5134,11 +5163,18 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv, *global_rsv;
int steal_from_global = 0;
- u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 min_size;
int ret;
trace_btrfs_inode_evict(inode);
+ if (!root) {
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+ return;
+ }
+
+ min_size = btrfs_calc_trunc_metadata_size(root, 1);
+
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
@@ -5232,14 +5268,14 @@ void btrfs_evict_inode(struct inode *inode)
}
/*
- * We can't just steal from the global reserve, we need tomake
+ * We can't just steal from the global reserve, we need to make
* sure there is room to do it, if not we need to commit and try
* again.
*/
if (steal_from_global) {
if (!btrfs_check_space_for_delayed_refs(trans, root))
ret = btrfs_block_rsv_migrate(global_rsv, rsv,
- min_size);
+ min_size, 0);
else
ret = -ENOSPC;
}
@@ -5733,6 +5769,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
int name_len;
int is_curr = 0; /* ctx->pos points to the current index? */
bool emitted;
+ bool put = false;
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
@@ -5750,7 +5787,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (key_type == BTRFS_DIR_INDEX_KEY) {
INIT_LIST_HEAD(&ins_list);
INIT_LIST_HEAD(&del_list);
- btrfs_get_delayed_items(inode, &ins_list, &del_list);
+ put = btrfs_readdir_get_delayed_items(inode, &ins_list,
+ &del_list);
}
key.type = key_type;
@@ -5897,8 +5935,8 @@ next:
nopos:
ret = 0;
err:
- if (key_type == BTRFS_DIR_INDEX_KEY)
- btrfs_put_delayed_items(&ins_list, &del_list);
+ if (put)
+ btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
btrfs_free_path(path);
return ret;
}
@@ -6213,9 +6251,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_inherit_iflags(inode, dir);
if (S_ISREG(mode)) {
- if (btrfs_test_opt(root, NODATASUM))
+ if (btrfs_test_opt(root->fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(root, NODATACOW))
+ if (btrfs_test_opt(root->fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
}
@@ -6293,7 +6331,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -6304,7 +6342,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
current_fs_time(parent_inode->i_sb);
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
fail_dir_item:
@@ -6717,7 +6755,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
- max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+ max_size = min_t(unsigned long, PAGE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
kfree(tmp);
@@ -6879,8 +6917,8 @@ next:
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_offset = page_offset(page) + pg_offset - extent_start;
- copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
- size - extent_offset);
+ copy_size = min_t(u64, PAGE_SIZE - pg_offset,
+ size - extent_offset);
em->start = extent_start + extent_offset;
em->len = ALIGN(copy_size, root->sectorsize);
em->orig_block_len = em->len;
@@ -6899,9 +6937,9 @@ next:
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
- if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+ if (pg_offset + copy_size < PAGE_SIZE) {
memset(map + pg_offset + copy_size, 0,
- PAGE_CACHE_SIZE - pg_offset -
+ PAGE_SIZE - pg_offset -
copy_size);
}
kunmap(page);
@@ -6964,7 +7002,18 @@ insert:
* existing will always be non-NULL, since there must be
* extent causing the -EEXIST.
*/
- if (start >= extent_map_end(existing) ||
+ if (existing->start == em->start &&
+ extent_map_end(existing) == extent_map_end(em) &&
+ em->block_start == existing->block_start) {
+ /*
+ * these two extents are the same, it happens
+ * with inlines especially
+ */
+ free_extent_map(em);
+ em = existing;
+ err = 0;
+
+ } else if (start >= extent_map_end(existing) ||
start <= existing->start) {
/*
* The existing extent map is the one nearest to
@@ -7129,6 +7178,43 @@ out:
return em;
}
+static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+ const u64 start,
+ const u64 len,
+ const u64 orig_start,
+ const u64 block_start,
+ const u64 block_len,
+ const u64 orig_block_len,
+ const u64 ram_bytes,
+ const int type)
+{
+ struct extent_map *em = NULL;
+ int ret;
+
+ down_read(&BTRFS_I(inode)->dio_sem);
+ if (type != BTRFS_ORDERED_NOCOW) {
+ em = create_pinned_em(inode, start, len, orig_start,
+ block_start, block_len, orig_block_len,
+ ram_bytes, type);
+ if (IS_ERR(em))
+ goto out;
+ }
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+ len, block_len, type);
+ if (ret) {
+ if (em) {
+ free_extent_map(em);
+ btrfs_drop_extent_cache(inode, start,
+ start + len - 1, 0);
+ }
+ em = ERR_PTR(ret);
+ }
+ out:
+ up_read(&BTRFS_I(inode)->dio_sem);
+
+ return em;
+}
+
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
u64 start, u64 len)
{
@@ -7144,41 +7230,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
if (ret)
return ERR_PTR(ret);
- /*
- * Create the ordered extent before the extent map. This is to avoid
- * races with the fast fsync path that would lead to it logging file
- * extent items that point to disk extents that were not yet written to.
- * The fast fsync path collects ordered extents into a local list and
- * then collects all the new extent maps, so we must create the ordered
- * extent first and make sure the fast fsync path collects any new
- * ordered extents after collecting new extent maps as well.
- * The fsync path simply can not rely on inode_dio_wait() because it
- * causes deadlock with AIO.
- */
- ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
- ins.offset, ins.offset, 0);
- if (ret) {
+ em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+ ins.objectid, ins.offset, ins.offset,
+ ins.offset, 0);
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ if (IS_ERR(em))
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- return ERR_PTR(ret);
- }
- em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, ins.offset, 0);
- if (IS_ERR(em)) {
- struct btrfs_ordered_extent *oe;
-
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- oe = btrfs_lookup_ordered_extent(inode, start);
- ASSERT(oe);
- if (WARN_ON(!oe))
- return em;
- set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
- set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
- btrfs_remove_ordered_extent(inode, oe);
- /* Once for our lookup and once for the ordered extents tree. */
- btrfs_put_ordered_extent(oe);
- btrfs_put_ordered_extent(oe);
- }
return em;
}
@@ -7336,12 +7394,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
int start_idx;
int end_idx;
- start_idx = start >> PAGE_CACHE_SHIFT;
+ start_idx = start >> PAGE_SHIFT;
/*
* end is the last byte in the last page. end == start is legal
*/
- end_idx = end >> PAGE_CACHE_SHIFT;
+ end_idx = end >> PAGE_SHIFT;
rcu_read_lock();
@@ -7382,7 +7440,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
* include/linux/pagemap.h for details.
*/
if (unlikely(page != *pagep)) {
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
}
@@ -7390,7 +7448,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
if (page) {
if (page->index <= end_idx)
found = true;
- page_cache_release(page);
+ put_page(page);
}
rcu_read_unlock();
@@ -7408,7 +7466,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
cached_state);
/*
* We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure theres no ordered
+ * doing DIO to, so we need to make sure there's no ordered
* extents in this range.
*/
ordered = btrfs_lookup_ordered_range(inode, lockstart,
@@ -7570,7 +7628,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (current->journal_info) {
/*
* Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transction doesn't get
+ * that anything that needs to check if there's a transaction doesn't get
* confused.
*/
dio_data = current->journal_info;
@@ -7603,7 +7661,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* decompress it, so there will be buffering required no matter what we
* do, so go ahead and fallback to buffered.
*
- * We return -ENOTBLK because thats what makes DIO go ahead and go back
+ * We return -ENOTBLK because that's what makes DIO go ahead and go back
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
@@ -7650,24 +7708,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1) {
+ &orig_block_len, &ram_bytes) == 1 &&
+ btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+ struct extent_map *em2;
+
+ em2 = btrfs_create_dio_extent(inode, start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(root->fs_info, block_start);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
- em = create_pinned_em(inode, start, len,
- orig_start,
- block_start, len,
- orig_block_len,
- ram_bytes, type);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
+ em = em2;
}
-
- ret = btrfs_add_ordered_extent_dio(inode, start,
- block_start, len, len, type);
- if (ret) {
- free_extent_map(em);
+ if (em2 && IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
goto unlock_err;
}
goto unlock;
@@ -7746,12 +7801,12 @@ err:
}
static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
- int rw, int mirror_num)
+ int mirror_num)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
- BUG_ON(rw & REQ_WRITE);
+ BUG_ON(bio_op(bio) == REQ_OP_WRITE);
bio_get(bio);
@@ -7760,7 +7815,7 @@ static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
if (ret)
goto err;
- ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+ ret = btrfs_map_bio(root, bio, mirror_num, 0);
err:
bio_put(bio);
return ret;
@@ -7811,7 +7866,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
int read_mode;
int ret;
- BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+ BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
if (ret)
@@ -7839,13 +7894,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
free_io_failure(inode, failrec);
return -EIO;
}
+ bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
btrfs_debug(BTRFS_I(inode)->root->fs_info,
"Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
read_mode, failrec->this_mirror, failrec->in_validation);
- ret = submit_dio_repair_bio(inode, bio, read_mode,
- failrec->this_mirror);
+ ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
if (ret) {
free_io_failure(inode, failrec);
bio_put(bio);
@@ -8135,7 +8190,7 @@ static void btrfs_endio_direct_write(struct bio *bio)
bio_put(bio);
}
-static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
struct bio *bio, int mirror_num,
unsigned long bio_flags, u64 offset)
{
@@ -8153,8 +8208,8 @@ static void btrfs_end_dio_bio(struct bio *bio)
if (err)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
- "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
- btrfs_ino(dip->inode), bio->bi_rw,
+ "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
+ btrfs_ino(dip->inode), bio_op(bio), bio->bi_rw,
(unsigned long long)bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
@@ -8228,11 +8283,11 @@ static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
}
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
- int rw, u64 file_offset, int skip_sum,
+ u64 file_offset, int skip_sum,
int async_submit)
{
struct btrfs_dio_private *dip = bio->bi_private;
- int write = rw & REQ_WRITE;
+ bool write = bio_op(bio) == REQ_OP_WRITE;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
@@ -8253,8 +8308,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
if (write && async_submit) {
ret = btrfs_wq_submit_bio(root->fs_info,
- inode, rw, bio, 0, 0,
- file_offset,
+ inode, bio, 0, 0, file_offset,
__btrfs_submit_bio_start_direct_io,
__btrfs_submit_bio_done);
goto err;
@@ -8273,13 +8327,13 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
goto err;
}
map:
- ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
+ ret = btrfs_map_bio(root, bio, 0, async_submit);
err:
bio_put(bio);
return ret;
}
-static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
+static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
int skip_sum)
{
struct inode *inode = dip->inode;
@@ -8298,8 +8352,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
int i;
map_length = orig_bio->bi_iter.bi_size;
- ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
- &map_length, NULL, 0);
+ ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
+ start_sector << 9, &map_length, NULL, 0);
if (ret)
return -EIO;
@@ -8319,6 +8373,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
if (!bio)
return -ENOMEM;
+ bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
@@ -8338,7 +8393,7 @@ next_block:
* before we're done setting it up
*/
atomic_inc(&dip->pending_bios);
- ret = __btrfs_submit_dio_bio(bio, inode, rw,
+ ret = __btrfs_submit_dio_bio(bio, inode,
file_offset, skip_sum,
async_submit);
if (ret) {
@@ -8356,12 +8411,13 @@ next_block:
start_sector, GFP_NOFS);
if (!bio)
goto out_err;
+ bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
map_length = orig_bio->bi_iter.bi_size;
- ret = btrfs_map_block(root->fs_info, rw,
+ ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
start_sector << 9,
&map_length, NULL, 0);
if (ret) {
@@ -8381,7 +8437,7 @@ next_block:
}
submit:
- ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
+ ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
async_submit);
if (!ret)
return 0;
@@ -8401,14 +8457,14 @@ out_err:
return 0;
}
-static void btrfs_submit_direct(int rw, struct bio *dio_bio,
- struct inode *inode, loff_t file_offset)
+static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
+ loff_t file_offset)
{
struct btrfs_dio_private *dip = NULL;
struct bio *io_bio = NULL;
struct btrfs_io_bio *btrfs_bio;
int skip_sum;
- int write = rw & REQ_WRITE;
+ bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
int ret = 0;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -8459,7 +8515,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dio_data->unsubmitted_oe_range_end;
}
- ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
+ ret = btrfs_submit_direct_hook(dip, skip_sum);
if (!ret)
return;
@@ -8541,13 +8597,13 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_data dio_data = { 0 };
+ loff_t offset = iocb->ki_pos;
size_t count = 0;
int flags = 0;
bool wakeup = true;
@@ -8607,7 +8663,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
ret = __blockdev_direct_IO(iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iter, offset, btrfs_get_blocks_direct, NULL,
+ iter, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
@@ -8719,7 +8775,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
if (ret == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
return ret;
}
@@ -8739,7 +8795,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = page_start + PAGE_SIZE - 1;
u64 start;
u64 end;
int inode_evicting = inode->i_state & I_FREEING;
@@ -8822,7 +8878,7 @@ again:
* 2) Not written to disk
* This means the reserved space should be freed here.
*/
- btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
+ btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8837,7 +8893,7 @@ again:
if (PagePrivate(page)) {
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
}
@@ -8874,11 +8930,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_end;
u64 end;
- reserved_space = PAGE_CACHE_SIZE;
+ reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ page_end = page_start + PAGE_SIZE - 1;
end = page_end;
/*
@@ -8934,15 +8990,15 @@ again:
goto again;
}
- if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
+ if (page->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, root->sectorsize);
- if (reserved_space < PAGE_CACHE_SIZE) {
+ if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, page_start,
- PAGE_CACHE_SIZE - reserved_space);
+ PAGE_SIZE - reserved_space);
}
}
@@ -8969,14 +9025,14 @@ again:
ret = 0;
/* page is wholly or partially inside EOF */
- if (page_start + PAGE_CACHE_SIZE > size)
- zero_start = size & ~PAGE_CACHE_MASK;
+ if (page_start + PAGE_SIZE > size)
+ zero_start = size & ~PAGE_MASK;
else
- zero_start = PAGE_CACHE_SIZE;
+ zero_start = PAGE_SIZE;
- if (zero_start != PAGE_CACHE_SIZE) {
+ if (zero_start != PAGE_SIZE) {
kaddr = kmap(page);
- memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+ memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
flush_dcache_page(page);
kunmap(page);
}
@@ -9019,7 +9075,7 @@ static int btrfs_truncate(struct inode *inode)
return ret;
/*
- * Yes ladies and gentelment, this is indeed ugly. The fact is we have
+ * Yes ladies and gentlemen, this is indeed ugly. The fact is we have
* 3 things going on here
*
* 1) We need to reserve space for our orphan item and the space to
@@ -9033,15 +9089,15 @@ static int btrfs_truncate(struct inode *inode)
* space reserved in case it uses space during the truncate (thank you
* very much snapshotting).
*
- * And we need these to all be seperate. The fact is we can use alot of
+ * And we need these to all be separate. The fact is we can use a lot of
* space doing the truncate, and we have no earthly idea how much space
- * we will use, so we need the truncate reservation to be seperate so it
+ * we will use, so we need the truncate reservation to be separate so it
* doesn't end up using space reserved for updating the inode or
* removing the orphan item. We also need to be able to stop the
* transaction and start a new one, which means we need to be able to
* update the inode several times, and we have no idea of knowing how
* many times that will be, so we can't just reserve 1 item for the
- * entirety of the opration, so that has to be done seperately as well.
+ * entirety of the operation, so that has to be done separately as well.
* Then there is the orphan item, which does indeed need to be held on
* to for the whole operation, and we need nobody to touch this reserved
* space except the orphan code.
@@ -9072,7 +9128,7 @@ static int btrfs_truncate(struct inode *inode)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
- min_size);
+ min_size, 0);
BUG_ON(ret);
/*
@@ -9112,7 +9168,7 @@ static int btrfs_truncate(struct inode *inode)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
- rsv, min_size);
+ rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
}
@@ -9133,7 +9189,6 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
}
-
out:
btrfs_free_block_rsv(root, rsv);
@@ -9230,6 +9285,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
+ init_rwsem(&ei->dio_sem);
return inode;
}
@@ -9341,25 +9397,25 @@ int btrfs_init_cachep(void)
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
sizeof(struct btrfs_trans_handle), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
goto fail;
btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
sizeof(struct btrfs_transaction), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
if (!btrfs_transaction_cachep)
goto fail;
btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
goto fail;
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
sizeof(struct btrfs_free_space), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_cachep)
goto fail;
@@ -9387,10 +9443,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
return 0;
}
+/*
+ * Exchange two existing directory entries: after a successful call,
+ * new_dentry's name in new_dir refers to old_inode and old_dentry's name in
+ * old_dir refers to new_inode.  Reached from btrfs_rename2() when
+ * RENAME_EXCHANGE is set.
+ *
+ * Both dentries are expected to be positive; their inodes are used
+ * unconditionally below.
+ *
+ * Returns 0 on success or a negative errno.  If a step fails, that first
+ * error is returned even when ending the transaction afterwards succeeds.
+ */
+static int btrfs_rename_exchange(struct inode *old_dir,
+			      struct dentry *old_dentry,
+			      struct inode *new_dir,
+			      struct dentry *new_dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+	struct inode *new_inode = d_inode(new_dentry);
+	struct inode *old_inode = d_inode(old_dentry);
+	struct timespec ctime = CURRENT_TIME;
+	struct dentry *parent;
+	u64 old_ino = btrfs_ino(old_inode);
+	u64 new_ino = btrfs_ino(new_inode);
+	u64 old_idx = 0;
+	u64 new_idx = 0;
+	u64 root_objectid;
+	int ret;
+	int ret2;
+	bool root_log_pinned = false;
+	bool dest_log_pinned = false;
+
+	/*
+	 * Only subvolume links may be exchanged between two different roots.
+	 * The exchange is symmetric, so the restriction applies to both
+	 * sides: if either inode is a regular (non-subvolume) inode, both
+	 * directories must belong to the same root.
+	 */
+	if (root != dest &&
+	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
+		return -EXDEV;
+
+	/*
+	 * close the race window with snapshot create/destroy ioctl
+	 *
+	 * NOTE(review): if root->fs_info and dest->fs_info are the same
+	 * object, exchanging two subvolume links takes subvol_sem for read
+	 * twice -- confirm that this nesting is safe.
+	 */
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+		down_read(&root->fs_info->subvol_sem);
+	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+		down_read(&dest->fs_info->subvol_sem);
+
+	/*
+	 * We want to reserve the absolute worst case amount of items.  So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume they are normal
+	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 12);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_notrans;
+	}
+
+	/*
+	 * We need to find a free sequence number both in the source and
+	 * in the destination directory for the exchange.
+	 */
+	ret = btrfs_set_inode_index(new_dir, &old_idx);
+	if (ret)
+		goto out_fail;
+	ret = btrfs_set_inode_index(old_dir, &new_idx);
+	if (ret)
+		goto out_fail;
+
+	BTRFS_I(old_inode)->dir_index = 0ULL;
+	BTRFS_I(new_inode)->dir_index = 0ULL;
+
+	/* Reference for the source. */
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		/* force full log commit if subvolume involved. */
+		btrfs_set_log_full_commit(root->fs_info, trans);
+	} else {
+		/* Pin before touching the tree; unpinned below or at out_fail. */
+		btrfs_pin_log_trans(root);
+		root_log_pinned = true;
+		ret = btrfs_insert_inode_ref(trans, dest,
+					     new_dentry->d_name.name,
+					     new_dentry->d_name.len,
+					     old_ino,
+					     btrfs_ino(new_dir), old_idx);
+		if (ret)
+			goto out_fail;
+	}
+
+	/* And now for the dest. */
+	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		/* force full log commit if subvolume involved. */
+		btrfs_set_log_full_commit(dest->fs_info, trans);
+	} else {
+		btrfs_pin_log_trans(dest);
+		dest_log_pinned = true;
+		ret = btrfs_insert_inode_ref(trans, root,
+					     old_dentry->d_name.name,
+					     old_dentry->d_name.len,
+					     new_ino,
+					     btrfs_ino(old_dir), new_idx);
+		if (ret)
+			goto out_fail;
+	}
+
+	/* Update inode version and ctime/mtime. */
+	inode_inc_iversion(old_dir);
+	inode_inc_iversion(new_dir);
+	inode_inc_iversion(old_inode);
+	inode_inc_iversion(new_inode);
+	old_dir->i_ctime = old_dir->i_mtime = ctime;
+	new_dir->i_ctime = new_dir->i_mtime = ctime;
+	old_inode->i_ctime = ctime;
+	new_inode->i_ctime = ctime;
+
+	if (old_dentry->d_parent != new_dentry->d_parent) {
+		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+		btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+	}
+
+	/* src is a subvolume */
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+		ret = btrfs_unlink_subvol(trans, root, old_dir,
+					  root_objectid,
+					  old_dentry->d_name.name,
+					  old_dentry->d_name.len);
+	} else { /* src is an inode */
+		ret = __btrfs_unlink_inode(trans, root, old_dir,
+					   old_inode,
+					   old_dentry->d_name.name,
+					   old_dentry->d_name.len);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, old_inode);
+	}
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out_fail;
+	}
+
+	/* dest is a subvolume */
+	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+		ret = btrfs_unlink_subvol(trans, dest, new_dir,
+					  root_objectid,
+					  new_dentry->d_name.name,
+					  new_dentry->d_name.len);
+	} else { /* dest is an inode */
+		ret = __btrfs_unlink_inode(trans, dest, new_dir,
+					   new_inode,
+					   new_dentry->d_name.name,
+					   new_dentry->d_name.len);
+		if (!ret)
+			ret = btrfs_update_inode(trans, dest, new_inode);
+	}
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out_fail;
+	}
+
+	/* Link each inode under the other entry's name. */
+	ret = btrfs_add_link(trans, new_dir, old_inode,
+			     new_dentry->d_name.name,
+			     new_dentry->d_name.len, 0, old_idx);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out_fail;
+	}
+
+	ret = btrfs_add_link(trans, old_dir, new_inode,
+			     old_dentry->d_name.name,
+			     old_dentry->d_name.len, 0, new_idx);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out_fail;
+	}
+
+	if (old_inode->i_nlink == 1)
+		BTRFS_I(old_inode)->dir_index = old_idx;
+	if (new_inode->i_nlink == 1)
+		BTRFS_I(new_inode)->dir_index = new_idx;
+
+	if (root_log_pinned) {
+		parent = new_dentry->d_parent;
+		btrfs_log_new_name(trans, old_inode, old_dir, parent);
+		btrfs_end_log_trans(root);
+		root_log_pinned = false;
+	}
+	if (dest_log_pinned) {
+		parent = old_dentry->d_parent;
+		btrfs_log_new_name(trans, new_inode, new_dir, parent);
+		btrfs_end_log_trans(dest);
+		dest_log_pinned = false;
+	}
+out_fail:
+	/*
+	 * If we have pinned a log and an error happened, we unpin tasks
+	 * trying to sync the log and force them to fallback to a transaction
+	 * commit if the log currently contains any of the inodes involved in
+	 * this rename operation (to ensure we do not persist a log with an
+	 * inconsistent state for any of these inodes or leading to any
+	 * inconsistencies when replayed). If the transaction was aborted, the
+	 * abortion reason is propagated to userspace when attempting to commit
+	 * the transaction. If the log does not contain any of these inodes, we
+	 * allow the tasks to sync it.
+	 */
+	if (ret && (root_log_pinned || dest_log_pinned)) {
+		if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+		    btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+		    btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+		    (new_inode &&
+		     btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+			btrfs_set_log_full_commit(root->fs_info, trans);
+
+		if (root_log_pinned) {
+			btrfs_end_log_trans(root);
+			root_log_pinned = false;
+		}
+		if (dest_log_pinned) {
+			btrfs_end_log_trans(dest);
+			dest_log_pinned = false;
+		}
+	}
+	/*
+	 * Always end the transaction, but do not let its return value hide an
+	 * earlier failure: report the first error that occurred.
+	 */
+	ret2 = btrfs_end_transaction(trans, root);
+	ret = ret ? ret : ret2;
+out_notrans:
+	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+		up_read(&dest->fs_info->subvol_sem);
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+		up_read(&root->fs_info->subvol_sem);
+
+	return ret;
+}
+
+
+/*
+ * Create a whiteout inode for a rename: a character-device inode with mode
+ * S_IFCHR | WHITEOUT_MODE and device WHITEOUT_DEV, linked into @dir under
+ * @dentry's name.
+ *
+ * The new inode is unlocked and our reference to it is dropped before
+ * returning, whether or not the later steps succeeded; on failure its link
+ * count is decremented first.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct inode *dir,
+				     struct dentry *dentry)
+{
+	struct inode *whiteout;
+	u64 ino;
+	u64 dir_index;
+	int err;
+
+	err = btrfs_find_free_ino(root, &ino);
+	if (err)
+		return err;
+
+	whiteout = btrfs_new_inode(trans, root, dir,
+				   dentry->d_name.name, dentry->d_name.len,
+				   btrfs_ino(dir), ino,
+				   S_IFCHR | WHITEOUT_MODE, &dir_index);
+	if (IS_ERR(whiteout))
+		return PTR_ERR(whiteout);
+
+	whiteout->i_op = &btrfs_special_inode_operations;
+	init_special_inode(whiteout, whiteout->i_mode, WHITEOUT_DEV);
+
+	/* Each step runs only if every previous one succeeded. */
+	err = btrfs_init_inode_security(trans, whiteout, dir,
+					&dentry->d_name);
+	if (!err)
+		err = btrfs_add_nondir(trans, dir, dentry,
+				       whiteout, 0, dir_index);
+	if (!err)
+		err = btrfs_update_inode(trans, root, whiteout);
+
+	/* Common exit for success and failure once the inode exists. */
+	unlock_new_inode(whiteout);
+	if (err)
+		inode_dec_link_count(whiteout);
+	iput(whiteout);
+
+	return err;
+}
+
+
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9726,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
u64 root_objectid;
int ret;
u64 old_ino = btrfs_ino(old_inode);
+ bool log_pinned = false;
if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9449,15 +9777,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* We want to reserve the absolute worst case amount of items. So if
* both inodes are subvols and we need to unlink them then that would
* require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
+ * would require 5 item modifications, so we'll assume they are normal
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
* should cover the worst case number of items we'll modify.
+ * If our rename has the whiteout flag, we need 5 more units for the
+ * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+ * when selinux is enabled).
*/
- trans = btrfs_start_transaction(root, 11);
+ trans_num_items = 11;
+ if (flags & RENAME_WHITEOUT)
+ trans_num_items += 5;
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_notrans;
- }
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9805,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(root->fs_info, trans);
} else {
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9478,14 +9814,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_ino(new_dir), index);
if (ret)
goto out_fail;
- /*
- * this is an ugly little race, but the rename is required
- * to make sure that if we crash, the inode is either at the
- * old name or the new one. pinning the log transaction lets
- * us make sure we don't allow a log commit to come in after
- * we unlink the name but before we add the new name back in.
- */
- btrfs_pin_log_trans(root);
}
inode_inc_iversion(old_dir);
@@ -9512,7 +9840,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = btrfs_update_inode(trans, root, old_inode);
}
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -9536,7 +9864,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!ret && new_inode->i_nlink == 0)
ret = btrfs_orphan_add(trans, d_inode(new_dentry));
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
}
@@ -9545,19 +9873,53 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (log_pinned) {
struct dentry *parent = new_dentry->d_parent;
+
btrfs_log_new_name(trans, old_inode, old_dir, parent);
btrfs_end_log_trans(root);
+ log_pinned = false;
+ }
+
+ if (flags & RENAME_WHITEOUT) {
+ ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+ old_dentry);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
out_fail:
+ /*
+ * If we have pinned the log and an error happened, we unpin tasks
+ * trying to sync the log and force them to fallback to a transaction
+ * commit if the log currently contains any of the inodes involved in
+ * this rename operation (to ensure we do not persist a log with an
+ * inconsistent state for any of these inodes or leading to any
+ * inconsistencies when replayed). If the transaction was aborted, the
+ * abortion reason is propagated to userspace when attempting to commit
+ * the transaction. If the log does not contain any of these inodes, we
+ * allow the tasks to sync it.
+ */
+ if (ret && log_pinned) {
+ if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+ btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+ (new_inode &&
+ btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ btrfs_end_log_trans(root);
+ log_pinned = false;
+ }
btrfs_end_transaction(trans, root);
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9932,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- if (flags & ~RENAME_NOREPLACE)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (flags & RENAME_EXCHANGE)
+ return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+
+ return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10308,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_end_transaction(trans, root);
break;
}
+ btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
@@ -9952,7 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
break;
@@ -10012,7 +10379,7 @@ next:
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
break;
@@ -10160,10 +10527,10 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.symlink = btrfs_symlink,
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
@@ -10181,10 +10548,10 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = btrfs_real_readdir,
+ .iterate_shared = btrfs_real_readdir,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = btrfs_ioctl,
+ .compat_ioctl = btrfs_compat_ioctl,
#endif
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
@@ -10237,10 +10604,10 @@ static const struct address_space_operations btrfs_symlink_aops = {
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
@@ -10251,10 +10618,10 @@ static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
@@ -10265,10 +10632,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
- .setxattr = btrfs_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = btrfs_removexattr,
+ .removexattr = generic_removexattr,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 053e677839fef..14ed1e9e6bc83 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
if (flags & BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
- if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
- iflags |= FS_COMPR_FL;
- else if (flags & BTRFS_INODE_NOCOMPRESS)
+ if (flags & BTRFS_INODE_NOCOMPRESS)
iflags |= FS_NOCOMP_FL;
+ else if (flags & BTRFS_INODE_COMPRESS)
+ iflags |= FS_COMPR_FL;
return iflags;
}
@@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
} else {
/*
- * Revert back under same assuptions as above
+ * Revert back under same assumptions as above
*/
if (S_ISREG(mode)) {
if (inode->i_size == 0)
@@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir,
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_root_item root_item;
+ struct btrfs_root_item *root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir,
u64 qgroup_reserved;
uuid_le new_uuid;
+ root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
+ if (!root_item)
+ return -ENOMEM;
+
ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
if (ret)
- return ret;
+ goto fail_free;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
- * screwed up since it assume subvolme qgroup's level to be 0.
+ * screwed up since it assumes subvolume qgroup's level to be 0.
*/
- if (btrfs_qgroup_level(objectid))
- return -ENOSPC;
+ if (btrfs_qgroup_level(objectid)) {
+ ret = -ENOSPC;
+ goto fail_free;
+ }
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir,
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
8, &qgroup_reserved, false);
if (ret)
- return ret;
+ goto fail_free;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv,
qgroup_reserved);
- return ret;
+ goto fail_free;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
@@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir,
BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
- memset(&root_item, 0, sizeof(root_item));
-
- inode_item = &root_item.inode;
+ inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
- btrfs_set_root_flags(&root_item, 0);
- btrfs_set_root_limit(&root_item, 0);
+ btrfs_set_root_flags(root_item, 0);
+ btrfs_set_root_limit(root_item, 0);
btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
- btrfs_set_root_bytenr(&root_item, leaf->start);
- btrfs_set_root_generation(&root_item, trans->transid);
- btrfs_set_root_level(&root_item, 0);
- btrfs_set_root_refs(&root_item, 1);
- btrfs_set_root_used(&root_item, leaf->len);
- btrfs_set_root_last_snapshot(&root_item, 0);
+ btrfs_set_root_bytenr(root_item, leaf->start);
+ btrfs_set_root_generation(root_item, trans->transid);
+ btrfs_set_root_level(root_item, 0);
+ btrfs_set_root_refs(root_item, 1);
+ btrfs_set_root_used(root_item, leaf->len);
+ btrfs_set_root_last_snapshot(root_item, 0);
- btrfs_set_root_generation_v2(&root_item,
- btrfs_root_generation(&root_item));
+ btrfs_set_root_generation_v2(root_item,
+ btrfs_root_generation(root_item));
uuid_le_gen(&new_uuid);
- memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
- btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
- btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
- root_item.ctime = root_item.otime;
- btrfs_set_root_ctransid(&root_item, trans->transid);
- btrfs_set_root_otransid(&root_item, trans->transid);
+ memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
+ root_item->ctime = root_item->otime;
+ btrfs_set_root_ctransid(root_item, trans->transid);
+ btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
leaf = NULL;
- btrfs_set_root_dirid(&root_item, new_dirid);
+ btrfs_set_root_dirid(root_item, new_dirid);
key.objectid = objectid;
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
- &root_item);
+ root_item);
if (ret)
goto fail;
@@ -557,7 +561,7 @@ static noinline int create_subvol(struct inode *dir,
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -566,7 +570,7 @@ static noinline int create_subvol(struct inode *dir,
ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
if (ret) {
/* We potentially lose an unused inode item here */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -579,7 +583,7 @@ static noinline int create_subvol(struct inode *dir,
*/
ret = btrfs_set_inode_index(dir, &index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -587,7 +591,7 @@ static noinline int create_subvol(struct inode *dir,
name, namelen, dir, &key,
BTRFS_FT_DIR, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir,
BUG_ON(ret);
ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
- root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
fail:
+ kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
@@ -629,6 +634,10 @@ fail:
d_instantiate(dentry, inode);
}
return ret;
+
+fail_free:
+ kfree(root_item);
+ return ret;
}
static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
@@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
- btrfs_wait_ordered_extents(root, -1);
+ btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -771,7 +780,7 @@ free_pending:
* a. be owner of dir, or
* b. be owner of victim, or
* c. have CAP_FOWNER capability
- * 6. If the victim is append-only or immutable we can't do antyhing with
+ * 6. If the victim is append-only or immutable we can't do anything with
* links pointing to it.
* 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
* 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
@@ -837,7 +846,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
struct dentry *dentry;
int error;
- error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (error == -EINTR)
return error;
@@ -898,7 +907,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
u64 end;
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+ em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
read_unlock(&em_tree->lock);
if (em) {
@@ -988,7 +997,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_CACHE_SIZE;
+ u64 len = PAGE_SIZE;
/*
* hopefully we have this extent in the tree already, try without
@@ -1124,15 +1133,15 @@ static int cluster_pages_for_defrag(struct inode *inode,
struct extent_io_tree *tree;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ file_end = (isize - 1) >> PAGE_SHIFT;
if (!isize || start_index > file_end)
return 0;
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode,
- start_index << PAGE_CACHE_SHIFT,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_SHIFT,
+ page_cnt << PAGE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1148,7 +1157,7 @@ again:
break;
page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ page_end = page_start + PAGE_SIZE - 1;
while (1) {
lock_extent_bits(tree, page_start, page_end,
&cached_state);
@@ -1169,7 +1178,7 @@ again:
*/
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
}
@@ -1179,7 +1188,7 @@ again:
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ret = -EIO;
break;
}
@@ -1187,7 +1196,7 @@ again:
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
@@ -1208,7 +1217,7 @@ again:
wait_on_page_writeback(pages[i]);
page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+ page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state);
@@ -1222,13 +1231,13 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
- start_index << PAGE_CACHE_SHIFT,
- (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ start_index << PAGE_SHIFT,
+ (page_cnt - i_done) << PAGE_SHIFT);
}
set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state,
@@ -1240,17 +1249,17 @@ again:
set_page_extent_mapped(pages[i]);
set_page_dirty(pages[i]);
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
return i_done;
out:
for (i = 0; i < i_done; i++) {
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
btrfs_delalloc_release_space(inode,
- start_index << PAGE_CACHE_SHIFT,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_SHIFT,
+ page_cnt << PAGE_SHIFT);
return ret;
}
@@ -1273,7 +1282,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
+ unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
unsigned long cluster = max_cluster;
u64 new_align = ~((u64)SZ_128K - 1);
struct page **pages = NULL;
@@ -1317,9 +1326,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
/* find the last page to defrag */
if (range->start + range->len > range->start) {
last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+ range->start + range->len - 1) >> PAGE_SHIFT;
} else {
- last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (isize - 1) >> PAGE_SHIFT;
}
if (newer_than) {
@@ -1331,11 +1340,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* we always align our defrag to help keep
* the extents in the file evenly spaced
*/
- i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ i = (newer_off & new_align) >> PAGE_SHIFT;
} else
goto out_ra;
} else {
- i = range->start >> PAGE_CACHE_SHIFT;
+ i = range->start >> PAGE_SHIFT;
}
if (!max_to_defrag)
max_to_defrag = last_index - i + 1;
@@ -1348,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
inode->i_mapping->writeback_index = i;
while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
+ (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
/*
* make sure we stop running if someone unmounts
* the FS
@@ -1362,7 +1371,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
}
- if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+ if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
extent_thresh, &last_len, &skip,
&defrag_end, range->flags &
BTRFS_DEFRAG_RANGE_COMPRESS)) {
@@ -1371,14 +1380,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
- next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
+ next = DIV_ROUND_UP(skip, PAGE_SIZE);
i = max(i + 1, next);
continue;
}
if (!newer_than) {
- cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
- PAGE_CACHE_SHIFT) - i;
+ cluster = (PAGE_ALIGN(defrag_end) >>
+ PAGE_SHIFT) - i;
cluster = min(cluster, max_cluster);
} else {
cluster = max_cluster;
@@ -1412,20 +1421,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i += ret;
newer_off = max(newer_off + 1,
- (u64)i << PAGE_CACHE_SHIFT);
+ (u64)i << PAGE_SHIFT);
ret = find_new_extents(root, inode, newer_than,
&newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ i = (newer_off & new_align) >> PAGE_SHIFT;
} else {
break;
}
} else {
if (ret > 0) {
i += ret;
- last_len += ret << PAGE_CACHE_SHIFT;
+ last_len += ret << PAGE_SHIFT;
} else {
i++;
last_len = 0;
@@ -1654,7 +1663,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
src_inode = file_inode(src.file);
if (src_inode->i_sb != file_inode(file)->i_sb) {
- btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+ btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
@@ -1722,7 +1731,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- if (vol_args->size > PAGE_CACHE_SIZE) {
+ if (vol_args->size > PAGE_SIZE) {
ret = -EINVAL;
goto free_args;
}
@@ -1939,8 +1948,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
return 1;
}
-static noinline int copy_to_sk(struct btrfs_root *root,
- struct btrfs_path *path,
+static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
size_t *buf_size,
@@ -2111,7 +2119,7 @@ static noinline int search_ioctl(struct inode *inode,
ret = 0;
goto err;
}
- ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
+ ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
if (ret)
@@ -2366,7 +2374,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
- err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (err == -EINTR)
goto out_drop_write;
dentry = lookup_one_len(vol_args->name, parent, namelen);
@@ -2397,7 +2405,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* rmdir(2).
*/
err = -EPERM;
- if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED))
goto out_dput;
/*
@@ -2480,7 +2488,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dentry->d_name.len);
if (ret) {
err = ret;
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -2496,7 +2504,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
root->fs_info->tree_root,
dest->root_key.objectid);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -2506,7 +2514,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -2516,7 +2524,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -2667,10 +2675,10 @@ out:
return ret;
}
-static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
- struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -2686,7 +2694,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
goto err_drop;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ /* Check for compatibility reject unknown flags */
+ if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
+ return -EOPNOTSUPP;
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
@@ -2695,13 +2705,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
mutex_lock(&root->fs_info->volume_mutex);
- ret = btrfs_rm_device(root, vol_args->name);
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ ret = btrfs_rm_device(root, NULL, vol_args->devid);
+ } else {
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_rm_device(root, vol_args->name, 0);
+ }
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
- if (!ret)
- btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
-
+ if (!ret) {
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
+ btrfs_info(root->fs_info, "device deleted: id %llu",
+ vol_args->devid);
+ else
+ btrfs_info(root->fs_info, "device deleted: %s",
+ vol_args->name);
+ }
out:
kfree(vol_args);
err_drop:
@@ -2709,6 +2729,47 @@ err_drop:
return ret;
}
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_drop_write;
+ }
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = btrfs_rm_device(root, vol_args->name, 0);
+ mutex_unlock(&root->fs_info->volume_mutex);
+
+ if (!ret)
+ btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+ kfree(vol_args);
+out:
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+out_drop_write:
+ mnt_drop_write_file(file);
+
+ return ret;
+}
+
static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
@@ -2806,12 +2867,12 @@ static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ERR_PTR(-EIO);
}
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ERR_PTR(-EAGAIN);
}
}
@@ -2823,7 +2884,7 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
int num_pages, u64 off)
{
int i;
- pgoff_t index = off >> PAGE_CACHE_SHIFT;
+ pgoff_t index = off >> PAGE_SHIFT;
for (i = 0; i < num_pages; i++) {
again:
@@ -2932,12 +2993,12 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
pg = cmp->src_pages[i];
if (pg) {
unlock_page(pg);
- page_cache_release(pg);
+ put_page(pg);
}
pg = cmp->dst_pages[i];
if (pg) {
unlock_page(pg);
- page_cache_release(pg);
+ put_page(pg);
}
}
kfree(cmp->src_pages);
@@ -2949,7 +3010,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
u64 len, struct cmp_pages *cmp)
{
int ret;
- int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+ int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
struct page **src_pgarr, **dst_pgarr;
/*
@@ -2987,12 +3048,12 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
int ret = 0;
int i;
struct page *src_page, *dst_page;
- unsigned int cmp_len = PAGE_CACHE_SIZE;
+ unsigned int cmp_len = PAGE_SIZE;
void *addr, *dst_addr;
i = 0;
while (len) {
- if (len < PAGE_CACHE_SIZE)
+ if (len < PAGE_SIZE)
cmp_len = len;
BUG_ON(i >= cmp->num_pages);
@@ -3191,7 +3252,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
if (olen > BTRFS_MAX_DEDUPE_LEN)
olen = BTRFS_MAX_DEDUPE_LEN;
- if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+ if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
/*
* Btrfs does not support blocksize < page_size. As a
* result, btrfs_cmp_data() won't correctly handle
@@ -3230,7 +3291,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3468,13 +3529,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = vmalloc(root->nodesize);
- if (!buf)
- return ret;
+ buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ if (!buf) {
+ buf = vmalloc(root->nodesize);
+ if (!buf)
+ return ret;
+ }
path = btrfs_alloc_path();
if (!path) {
- vfree(buf);
+ kvfree(buf);
return ret;
}
@@ -3629,7 +3693,7 @@ process_slot:
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
- root, ret);
+ ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3637,8 +3701,7 @@ process_slot:
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3670,7 +3733,6 @@ process_slot:
new_key.offset - datao);
if (ret) {
btrfs_abort_transaction(trans,
- root,
ret);
btrfs_end_transaction(trans,
root);
@@ -3707,7 +3769,6 @@ process_slot:
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
- root,
ret);
btrfs_end_transaction(trans, root);
goto out;
@@ -3763,7 +3824,7 @@ process_slot:
last_dest_end, destoff + len, 1);
if (ret) {
if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3775,7 +3836,7 @@ process_slot:
out:
btrfs_free_path(path);
- vfree(buf);
+ kvfree(buf);
return ret;
}
@@ -3891,8 +3952,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* data immediately and not the previous data.
*/
truncate_inode_pages_range(&inode->i_data,
- round_down(destoff, PAGE_CACHE_SIZE),
- round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
+ round_down(destoff, PAGE_SIZE),
+ round_up(destoff + len, PAGE_SIZE) - 1);
out_unlock:
if (!same_inode)
btrfs_double_inode_unlock(src, inode);
@@ -4124,7 +4185,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
/* we generally have at most 6 or so space infos, one for each raid
* level. So, a whole page should be more than enough for everyone
*/
- if (alloc_size > PAGE_CACHE_SIZE)
+ if (alloc_size > PAGE_SIZE)
return -ENOMEM;
space_args.total_spaces = 0;
@@ -4376,7 +4437,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
1)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
- ret = btrfs_dev_replace_start(root, p);
+ ret = btrfs_dev_replace_by_ioctl(root, p);
atomic_set(
&root->fs_info->mutually_exclusive_operation_running,
0);
@@ -4585,7 +4646,7 @@ again:
}
/*
- * mut. excl. ops lock is locked. Three possibilites:
+ * mut. excl. ops lock is locked. Three possibilities:
* (1) some other op is running
* (2) balance is running
* (3) balance is paused -- special case (think resume)
@@ -4847,8 +4908,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
/* update qgroup status and info */
err = btrfs_run_qgroups(trans, root->fs_info);
if (err < 0)
- btrfs_std_error(root->fs_info, ret,
- "failed to update qgroup status and info\n");
+ btrfs_handle_fs_error(root->fs_info, err,
+ "failed to update qgroup status and info");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
ret = err;
@@ -5099,13 +5160,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret < 0 && ret != -EEXIST) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_commit_transaction(trans, root);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -5394,9 +5455,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
if (ret)
return ret;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop_write;
+ }
spin_lock(&root->fs_info->super_lock);
newflags = btrfs_super_compat_flags(super_block);
@@ -5415,7 +5482,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
btrfs_set_super_incompat_flags(super_block, newflags);
spin_unlock(&root->fs_info->super_lock);
- return btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
+out_drop_write:
+ mnt_drop_write_file(file);
+
+ return ret;
}
long btrfs_ioctl(struct file *file, unsigned int
@@ -5459,6 +5530,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_add_dev(root, argp);
case BTRFS_IOC_RM_DEV:
return btrfs_ioctl_rm_dev(file, argp);
+ case BTRFS_IOC_RM_DEV_V2:
+ return btrfs_ioctl_rm_dev_v2(file, argp);
case BTRFS_IOC_FS_INFO:
return btrfs_ioctl_fs_info(root, argp);
case BTRFS_IOC_DEV_INFO:
@@ -5490,7 +5563,7 @@ long btrfs_ioctl(struct file *file, unsigned int
ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
/*
* The transaction thread may want to do more work,
- * namely it pokes the cleaner ktread that will start
+ * namely it pokes the cleaner kthread that will start
* processing uncleaned subvols.
*/
wake_up_process(root->fs_info->transaction_kthread);
@@ -5552,3 +5625,24 @@ long btrfs_ioctl(struct file *file, unsigned int
return -ENOTTY;
}
+
+#ifdef CONFIG_COMPAT
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a2f0513477313..1adfbe7be6b80 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -55,8 +55,8 @@ static struct list_head *lzo_alloc_workspace(void)
return ERR_PTR(-ENOMEM);
workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
- workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
- workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+ workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
+ workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -116,7 +116,7 @@ static int lzo_compress_pages(struct list_head *ws,
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
data_in = kmap(in_page);
/*
@@ -133,10 +133,10 @@ static int lzo_compress_pages(struct list_head *ws,
tot_out = LZO_LEN;
pages[0] = out_page;
nr_pages = 1;
- pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+ pg_bytes_left = PAGE_SIZE - LZO_LEN;
/* compress at most one page of data each time */
- in_len = min(len, PAGE_CACHE_SIZE);
+ in_len = min(len, PAGE_SIZE);
while (tot_in < len) {
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
&out_len, workspace->mem);
@@ -201,7 +201,7 @@ static int lzo_compress_pages(struct list_head *ws,
cpage_out = kmap(out_page);
pages[nr_pages++] = out_page;
- pg_bytes_left = PAGE_CACHE_SIZE;
+ pg_bytes_left = PAGE_SIZE;
out_offset = 0;
}
}
@@ -221,12 +221,12 @@ static int lzo_compress_pages(struct list_head *ws,
bytes_left = len - tot_in;
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
- start += PAGE_CACHE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ start += PAGE_SIZE;
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
data_in = kmap(in_page);
- in_len = min(bytes_left, PAGE_CACHE_SIZE);
+ in_len = min(bytes_left, PAGE_SIZE);
}
if (tot_out > tot_in)
@@ -248,7 +248,7 @@ out:
if (in_page) {
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
}
return ret;
@@ -266,7 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
char *data_in;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
unsigned long bytes;
@@ -289,7 +289,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
tot_in = LZO_LEN;
in_offset = LZO_LEN;
tot_len = min_t(size_t, srclen, tot_len);
- in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+ in_page_bytes_left = PAGE_SIZE - LZO_LEN;
tot_out = 0;
pg_offset = 0;
@@ -345,12 +345,12 @@ cont:
data_in = kmap(pages_in[++page_in_index]);
- in_page_bytes_left = PAGE_CACHE_SIZE;
+ in_page_bytes_left = PAGE_SIZE;
in_offset = 0;
}
}
- out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+ out_len = lzo1x_worst_compress(PAGE_SIZE);
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
&out_len);
if (need_unmap)
@@ -399,7 +399,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
in_len = read_compress_length(data_in);
data_in += LZO_LEN;
- out_len = PAGE_CACHE_SIZE;
+ out_len = PAGE_SIZE;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "BTRFS: decompress failed!\n");
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0de7da5a610d7..3b78d38173b3f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+ const u64 range_start, const u64 range_len)
{
- struct list_head splice, works;
+ LIST_HEAD(splice);
+ LIST_HEAD(skipped);
+ LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
int count = 0;
-
- INIT_LIST_HEAD(&splice);
- INIT_LIST_HEAD(&works);
+ const u64 range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
@@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
while (!list_empty(&splice) && nr) {
ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
root_extent_list);
+
+ if (range_end <= ordered->start ||
+ ordered->start + ordered->disk_len <= range_start) {
+ list_move_tail(&ordered->root_extent_list, &skipped);
+ cond_resched_lock(&root->ordered_extent_lock);
+ continue;
+ }
+
list_move_tail(&ordered->root_extent_list,
&root->ordered_extents);
atomic_inc(&ordered->refs);
@@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
nr--;
count++;
}
+ list_splice_tail(&skipped, &root->ordered_extents);
list_splice_tail(&splice, &root->ordered_extents);
spin_unlock(&root->ordered_extent_lock);
@@ -708,11 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
return count;
}
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+ const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
int done;
+ int total_done = 0;
INIT_LIST_HEAD(&splice);
@@ -728,8 +740,10 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
- done = btrfs_wait_ordered_extents(root, nr);
+ done = btrfs_wait_ordered_extents(root, nr,
+ range_start, range_len);
btrfs_put_fs_root(root);
+ total_done += done;
spin_lock(&fs_info->ordered_root_lock);
if (nr != -1) {
@@ -740,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
mutex_unlock(&fs_info->ordered_operations_mutex);
+
+ return total_done;
}
/*
@@ -952,6 +968,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct rb_node *prev = NULL;
struct btrfs_ordered_extent *test;
int ret = 1;
+ u64 orig_offset = offset;
spin_lock_irq(&tree->lock);
if (ordered) {
@@ -967,7 +984,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
/* truncate file */
if (disk_i_size > i_size) {
- BTRFS_I(inode)->disk_i_size = i_size;
+ BTRFS_I(inode)->disk_i_size = orig_offset;
ret = 0;
goto out;
}
@@ -1105,7 +1122,7 @@ int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
sizeof(struct btrfs_ordered_extent), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_ordered_extent_cache)
return -ENOMEM;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 23c96059cef26..451507776ff59 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -58,7 +58,7 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
-#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */
#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
@@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+ const u64 range_start, const u64 range_len);
+int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+ const u64 range_start, const u64 range_len);
void btrfs_get_logged_extents(struct inode *inode,
struct list_head *logged_list,
const loff_t start,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 36992128c7466..cf0b444ac4f30 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *parent_root)
{
+ struct super_block *sb = root->fs_info->sb;
struct btrfs_key key;
struct inode *parent_inode, *child_inode;
int ret;
@@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
- parent_root, NULL);
+ parent_inode = btrfs_iget(sb, &key, parent_root, NULL);
if (IS_ERR(parent_inode))
return PTR_ERR(parent_inode);
- child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ child_inode = btrfs_iget(sb, &key, root, NULL);
if (IS_ERR(child_inode)) {
iput(parent_inode);
return PTR_ERR(child_inode);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5279fdae7142f..93ee1c18ef9d4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -85,7 +85,7 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
- * Refer to qgroup_shared_accouting() for details.
+ * Refer to qgroup_shared_accounting() for details.
*/
u64 old_refcnt;
u64 new_refcnt;
@@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
}
/*
* we call btrfs_free_qgroup_config() when umounting
- * filesystem and disabling quota, so we set qgroup_ulit
+ * filesystem and disabling quota, so we set qgroup_ulist
* to be null here to avoid double free.
*/
ulist_free(fs_info->qgroup_ulist);
@@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
- if (btrfs_test_is_dummy_root(quota_root))
+ if (btrfs_is_testing(quota_root->fs_info))
return 0;
path = btrfs_alloc_path();
@@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return 0;
key.objectid = 0;
@@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
/*
* The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their refrence and
+ * then this qgroup and all of the parent qgroups get their reference and
* exclusive counts adjusted.
*
* Caller should hold fs_info->qgroup_lock.
@@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
/*
* No need to do lock, since this function will only be called in
- * btrfs_commmit_transaction().
+ * btrfs_commit_transaction().
*/
node = rb_first(&delayed_refs->dirty_extent_root);
while (node) {
@@ -1453,9 +1453,10 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
return ret;
}
-struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record)
+struct btrfs_qgroup_extent_record *
+btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
{
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
struct rb_node *parent_node = NULL;
@@ -1463,6 +1464,7 @@ struct btrfs_qgroup_extent_record
u64 bytenr = record->bytenr;
assert_spin_locked(&delayed_refs->lock);
+ trace_btrfs_qgroup_insert_dirty_extent(fs_info, record);
while (*p) {
parent_node = *p;
@@ -1556,7 +1558,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* A: cur_old_roots < nr_old_roots (not exclusive before)
* !A: cur_old_roots == nr_old_roots (possible exclusive before)
* B: cur_new_roots < nr_new_roots (not exclusive now)
- * !B: cur_new_roots == nr_new_roots (possible exclsuive now)
+ * !B: cur_new_roots == nr_new_roots (possible exclusive now)
*
* Results:
* +: Possible sharing -> exclusive -: Possible exclusive -> sharing
@@ -1594,6 +1596,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+ trace_qgroup_update_counters(fs_info, qg->qgroupid,
+ cur_old_count, cur_new_count);
+
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
@@ -1683,6 +1688,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
goto out_free;
BUG_ON(!fs_info->quota_root);
+ trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes,
+ nr_old_roots, nr_new_roots);
+
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
ret = -ENOMEM;
@@ -1752,6 +1760,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
+ trace_btrfs_qgroup_account_extents(fs_info, record);
+
if (!ret) {
/*
* Use (u64)-1 as time_seq to do special search, which
@@ -1842,8 +1852,10 @@ out:
}
/*
- * copy the acounting information between qgroups. This is necessary when a
- * snapshot or a subvolume is created
+ * Copy the accounting information between qgroups. This is necessary
+ * when a snapshot or a subvolume is created. Throwing an error will
+ * cause a transaction abort so we take extra care here to only error
+ * when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
@@ -1873,15 +1885,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2 * inherit->num_excl_copies;
for (i = 0; i < nums; ++i) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
- if (!srcgroup) {
- ret = -EINVAL;
- goto out;
- }
- if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
- ret = -EINVAL;
- goto out;
- }
+ /*
+ * Zero out invalid groups so we can ignore
+ * them later.
+ */
+ if (!srcgroup ||
+ ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
+ *i_qgroups = 0ULL;
+
++i_qgroups;
}
}
@@ -1916,17 +1928,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i) {
+ for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+ if (*i_qgroups == 0)
+ continue;
ret = add_qgroup_relation_item(trans, quota_root,
objectid, *i_qgroups);
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
ret = add_qgroup_relation_item(trans, quota_root,
*i_qgroups, objectid);
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
- ++i_qgroups;
}
+ ret = 0;
}
@@ -1987,17 +2001,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
- ret = add_relation_rb(quota_root->fs_info, objectid,
- *i_qgroups);
- if (ret)
- goto unlock;
+ if (*i_qgroups) {
+ ret = add_relation_rb(quota_root->fs_info, objectid,
+ *i_qgroups);
+ if (ret)
+ goto unlock;
+ }
++i_qgroups;
}
- for (i = 0; i < inherit->num_ref_copies; ++i) {
+ for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
@@ -2008,12 +2027,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
- i_qgroups += 2;
}
- for (i = 0; i < inherit->num_excl_copies; ++i) {
+ for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
@@ -2024,7 +2045,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
- i_qgroups += 2;
}
unlock:
@@ -2176,7 +2196,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
return;
- btrfs_err(trans->root->fs_info,
+ btrfs_err(trans->fs_info,
"qgroups not uptodate in trans handle %p: list is%s empty, "
"seq is %#x.%x",
trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
@@ -2321,7 +2341,7 @@ out:
mutex_unlock(&fs_info->qgroup_rescan_lock);
/*
- * only update status, since the previous part has alreay updated the
+ * only update status, since the previous part has already updated the
* qgroup info.
*/
trans = btrfs_start_transaction(fs_info->quota_root, 1);
@@ -2523,8 +2543,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
changeset.bytes_changed = 0;
changeset.range_changed = ulist_alloc(GFP_NOFS);
ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
- &changeset);
+ start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
trace_btrfs_qgroup_reserve_data(inode, start, len,
changeset.bytes_changed,
QGROUP_RESERVE);
@@ -2561,8 +2580,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
return -ENOMEM;
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
- &changeset);
+ start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -2653,7 +2671,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
}
/*
- * Check qgroup reserved space leaking, normally at destory inode
+ * Check qgroup reserved space leaking, normally at destroy inode
* time
*/
void btrfs_qgroup_check_reserved_leak(struct inode *inode)
@@ -2669,7 +2687,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
return;
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+ EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index ecb2c143ef756..710887c06aaf4 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -63,9 +63,10 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record);
+struct btrfs_qgroup_extent_record *
+btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
int
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -88,7 +89,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
- trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+ trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 55161369fab14..cd8d302a1f615 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -270,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
s = kmap(rbio->bio_pages[i]);
d = kmap(rbio->stripe_pages[i]);
- memcpy(d, s, PAGE_CACHE_SIZE);
+ memcpy(d, s, PAGE_SIZE);
kunmap(rbio->bio_pages[i]);
kunmap(rbio->stripe_pages[i]);
@@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
* we can't merge with cached rbios, since the
* idea is that when we merge the destination
* rbio is going to run our IO for us. We can
- * steal from cached rbio's though, other functions
+ * steal from cached rbios though, other functions
* handle that.
*/
if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
@@ -962,7 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
*/
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
- return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes;
+ return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}
/*
@@ -1078,7 +1078,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
u64 disk_start;
stripe = &rbio->bbio->stripes[stripe_nr];
- disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+ disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
if (!stripe->dev->bdev)
@@ -1096,8 +1096,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
if (last_end == disk_start && stripe->dev->bdev &&
!last->bi_error &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
- if (ret == PAGE_CACHE_SIZE)
+ ret = bio_add_page(last, page, PAGE_SIZE, 0);
+ if (ret == PAGE_SIZE)
return 0;
}
}
@@ -1111,7 +1111,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
bio->bi_bdev = stripe->dev->bdev;
bio->bi_iter.bi_sector = disk_start >> 9;
- bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_add_page(bio, page, PAGE_SIZE, 0);
bio_list_add(bio_list, bio);
return 0;
}
@@ -1154,7 +1154,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
bio_list_for_each(bio, &rbio->bio_list) {
start = (u64)bio->bi_iter.bi_sector << 9;
stripe_offset = start - rbio->bbio->raid_map[0];
- page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+ page_index = stripe_offset >> PAGE_SHIFT;
for (i = 0; i < bio->bi_vcnt; i++) {
p = bio->bi_io_vec[i].bv_page;
@@ -1253,7 +1253,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
} else {
/* raid5 */
memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
- run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
@@ -1320,7 +1320,9 @@ write_data:
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- submit_bio(WRITE, bio);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ submit_bio(bio);
}
return;
@@ -1573,11 +1575,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- submit_bio(READ, bio);
+ submit_bio(bio);
}
/* the actual write will happen once the reads are done */
return 0;
@@ -1914,7 +1917,7 @@ pstripe:
/* Copy parity block into failed block to start with */
memcpy(pointers[faila],
pointers[rbio->nr_data],
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
/* rearrange the pointer array */
p = pointers[faila];
@@ -1923,7 +1926,7 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+ run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
}
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
@@ -2097,11 +2100,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- submit_bio(READ, bio);
+ submit_bio(bio);
}
out:
return 0;
@@ -2250,7 +2254,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
- index = stripe_offset >> PAGE_CACHE_SHIFT;
+ index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
@@ -2365,14 +2369,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
} else {
/* raid5 */
memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
- run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
- /* Check scrubbing pairty and repair it */
+ /* Check scrubbing parity and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
parity = kmap(p);
- if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
- memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+ if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
+ memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE);
else
/* Parity is right, needn't writeback */
bitmap_clear(rbio->dbitmap, pagenr, 1);
@@ -2433,7 +2437,9 @@ submit_write:
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- submit_bio(WRITE, bio);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ submit_bio(bio);
}
return;
@@ -2493,7 +2499,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
/*
* Here means we got one corrupted data stripe and one
* corrupted parity on RAID6, if the corrupted parity
- * is scrubbing parity, luckly, use the other one to repair
+ * is scrubbing parity, luckily, use the other one to repair
* the data, or we can not repair the data stripe.
*/
if (failp != rbio->scrubp)
@@ -2610,11 +2616,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- submit_bio(READ, bio);
+ submit_bio(bio);
}
/* the actual write will happen once the reads are done */
return;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index b892914968c18..8428db7cd88fa 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -226,7 +226,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info,
/* find extent */
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree,
- start >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT);
if (re)
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -257,7 +257,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
zone = NULL;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_CACHE_SHIFT, 1);
+ logical >> PAGE_SHIFT, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end) {
kref_get(&zone->refcnt);
spin_unlock(&fs_info->reada_lock);
@@ -294,13 +294,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
- (unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
+ (unsigned long)(zone->end >> PAGE_SHIFT),
zone);
if (ret == -EEXIST) {
kfree(zone);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_CACHE_SHIFT, 1);
+ logical >> PAGE_SHIFT, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end)
kref_get(&zone->refcnt);
else
@@ -326,7 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
u64 length;
int real_stripes;
int nzones = 0;
- unsigned long index = logical >> PAGE_CACHE_SHIFT;
+ unsigned long index = logical >> PAGE_SHIFT;
int dev_replace_is_ongoing;
int have_zone = 0;
@@ -495,7 +495,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
struct reada_extent *re)
{
int i;
- unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+ unsigned long index = re->logical >> PAGE_SHIFT;
spin_lock(&fs_info->reada_lock);
if (--re->refcnt) {
@@ -538,7 +538,7 @@ static void reada_zone_release(struct kref *kref)
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
radix_tree_delete(&zone->device->reada_zones,
- zone->end >> PAGE_CACHE_SHIFT);
+ zone->end >> PAGE_SHIFT);
kfree(zone);
}
@@ -587,7 +587,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
int i;
- unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+ unsigned long index = zone->end >> PAGE_SHIFT;
for (i = 0; i < zone->ndevs; ++i) {
struct reada_zone *peer;
@@ -622,7 +622,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
(void **)&zone, index, 1);
if (ret == 0)
break;
- index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ index = (zone->end >> PAGE_SHIFT) + 1;
if (zone->locked) {
if (zone->elems > top_locked_elems) {
top_locked_elems = zone->elems;
@@ -673,7 +673,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
* plugging to speed things up
*/
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ dev->reada_next >> PAGE_SHIFT, 1);
if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
ret = reada_pick_zone(dev);
if (!ret) {
@@ -682,7 +682,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
}
re = NULL;
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ dev->reada_next >> PAGE_SHIFT, 1);
}
if (ret == 0) {
spin_unlock(&fs_info->reada_lock);
@@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
do {
enqueued = 0;
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(fs_info,
device);
}
+ mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
@@ -838,7 +840,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
printk(KERN_CONT " curr off %llu",
device->reada_next - zone->start);
printk(KERN_CONT "\n");
- index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ index = (zone->end >> PAGE_SHIFT) + 1;
}
cnt = 0;
index = 0;
@@ -864,7 +866,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
}
printk(KERN_CONT "\n");
- index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ index = (re->logical >> PAGE_SHIFT) + 1;
if (++cnt > 15)
break;
}
@@ -880,7 +882,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
if (ret == 0)
break;
if (!re->scheduled) {
- index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ index = (re->logical >> PAGE_SHIFT) + 1;
continue;
}
printk(KERN_DEBUG
@@ -897,7 +899,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
}
printk(KERN_CONT "\n");
- index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ index = (re->logical >> PAGE_SHIFT) + 1;
}
spin_unlock(&fs_info->reada_lock);
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 2bd0011450df2..b26a5aea41b4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -235,12 +235,12 @@ static void backref_cache_cleanup(struct backref_cache *cache)
cache->last_trans = 0;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
- BUG_ON(!list_empty(&cache->pending[i]));
- BUG_ON(!list_empty(&cache->changed));
- BUG_ON(!list_empty(&cache->detached));
- BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
- BUG_ON(cache->nr_nodes);
- BUG_ON(cache->nr_edges);
+ ASSERT(list_empty(&cache->pending[i]));
+ ASSERT(list_empty(&cache->changed));
+ ASSERT(list_empty(&cache->detached));
+ ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
+ ASSERT(!cache->nr_nodes);
+ ASSERT(!cache->nr_edges);
}
static struct backref_node *alloc_backref_node(struct backref_cache *cache)
@@ -668,8 +668,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
* roots of b-trees that reference the tree block.
*
* the basic idea of this function is check backrefs of a given block
- * to find upper level blocks that refernece the block, and then check
- * bakcrefs of these upper level blocks recursively. the recursion stop
+ * to find upper level blocks that reference the block, and then check
+ * backrefs of these upper level blocks recursively. the recursion stops
* when tree root is reached or backrefs for the block is cached.
*
* NOTE: if we find backrefs for a block are cached, we know backrefs
@@ -1160,7 +1160,7 @@ out:
if (!RB_EMPTY_NODE(&upper->rb_node))
continue;
- /* Add this guy's upper edges to the list to proces */
+ /* Add this guy's upper edges to the list to process */
list_for_each_entry(edge, &upper->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
if (list_empty(&upper->upper))
@@ -1171,8 +1171,12 @@ out:
lower = list_entry(useless.next,
struct backref_node, list);
list_del_init(&lower->list);
+ if (lower == node)
+ node = NULL;
free_backref_node(cache, lower);
}
+
+ free_backref_node(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
@@ -1719,7 +1723,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_header_owner(leaf),
key.objectid, key.offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
@@ -1727,7 +1731,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
parent, btrfs_header_owner(leaf),
key.objectid, key.offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
}
@@ -1850,6 +1854,7 @@ again:
eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
+ break;
} else if (!extent_buffer_uptodate(eb)) {
ret = -EIO;
free_extent_buffer(eb);
@@ -2395,7 +2400,7 @@ again:
}
/*
- * we keep the old last snapshod transid in rtranid when we
+ * we keep the old last snapshot transid in rtranid when we
* created the relocation tree.
*/
last_snap = btrfs_root_rtransid(&reloc_root->root_item);
@@ -2417,7 +2422,7 @@ again:
}
out:
if (ret) {
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
@@ -2603,25 +2608,28 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
+
+ /*
+ * We are under a transaction here so we can only do limited flushing.
+ * If we get an enospc just kick back -EAGAIN so we know to drop the
+ * transaction and try to refill when we can flush all the things.
+ */
ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_ALL);
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
- if (ret == -EAGAIN) {
- tmp = rc->extent_root->nodesize *
- RELOCATION_RESERVED_NODES;
- while (tmp <= rc->reserved_bytes)
- tmp <<= 1;
- /*
- * only one thread can access block_rsv at this point,
- * so we don't need hold lock to protect block_rsv.
- * we expand more reservation size here to allow enough
- * space for relocation and we will return eailer in
- * enospc case.
- */
- rc->block_rsv->size = tmp + rc->extent_root->nodesize *
- RELOCATION_RESERVED_NODES;
- }
- return ret;
+ tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES;
+ while (tmp <= rc->reserved_bytes)
+ tmp <<= 1;
+ /*
+ * only one thread can access block_rsv at this point,
+ * so we don't need to hold a lock to protect block_rsv.
+ * we expand more reservation size here to allow enough
+ * space for relocation and we will return earlier in
+ * enospc case.
+ */
+ rc->block_rsv->size = tmp + rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ return -EAGAIN;
}
return 0;
@@ -2813,7 +2821,7 @@ static void mark_block_processed(struct reloc_control *rc,
u64 bytenr, u32 blocksize)
{
set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
- EXTENT_DIRTY, GFP_NOFS);
+ EXTENT_DIRTY);
}
static void __mark_block_processed(struct reloc_control *rc,
@@ -3129,10 +3137,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (ret)
goto out;
- index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
- last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+ index = (cluster->start - offset) >> PAGE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_SHIFT;
while (index <= last_index) {
- ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE);
if (ret)
goto out;
@@ -3145,7 +3153,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
mask);
if (!page) {
btrfs_delalloc_release_metadata(inode,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -3162,16 +3170,16 @@ static int relocate_file_extent_cluster(struct inode *inode,
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
btrfs_delalloc_release_metadata(inode,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
ret = -EIO;
goto out;
}
}
page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ page_end = page_start + PAGE_SIZE - 1;
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
@@ -3181,7 +3189,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_start + offset == cluster->boundary[nr]) {
set_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
- EXTENT_BOUNDARY, GFP_NOFS);
+ EXTENT_BOUNDARY);
nr++;
}
@@ -3191,7 +3199,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
unlock_extent(&BTRFS_I(inode)->io_tree,
page_start, page_end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
index++;
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -3870,6 +3878,7 @@ static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
struct btrfs_trans_handle *trans;
+ int ret;
rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
BTRFS_BLOCK_RSV_TEMP);
@@ -3884,6 +3893,11 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->reserved_bytes = 0;
rc->block_rsv->size = rc->extent_root->nodesize *
RELOCATION_RESERVED_NODES;
+ ret = btrfs_block_rsv_refill(rc->extent_root,
+ rc->block_rsv, rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret)
+ return ret;
rc->create_reloc_tree = 1;
set_reloc_control(rc);
@@ -4058,8 +4072,7 @@ restart:
}
btrfs_release_path(path);
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
- GFP_NOFS);
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
if (trans) {
btrfs_end_transaction_throttle(trans, rc->extent_root);
@@ -4253,12 +4266,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
rc->block_group->key.objectid, rc->block_group->flags);
- ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- btrfs_wait_ordered_roots(fs_info, -1);
+ btrfs_wait_block_group_reservations(rc->block_group);
+ btrfs_wait_nocow_writers(rc->block_group);
+ btrfs_wait_ordered_roots(fs_info, -1,
+ rc->block_group->key.objectid,
+ rc->block_group->key.offset);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
@@ -4591,7 +4603,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
/*
* called before creating snapshot. it calculates metadata reservation
- * requried for relocating tree blocks in the snapshot
+ * required for relocating tree blocks in the snapshot
*/
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve)
@@ -4644,7 +4656,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
- rc->nodes_relocated);
+ rc->nodes_relocated, 1);
if (ret)
return ret;
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9fcd6dfc3266f..7fd7e1830cfe6 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
* search_key: the key to search
* path: the path we search
* root_item: the root item of the tree we look for
- * root_key: the reak key of the tree we look for
+ * root_key: the root key of the tree we look for
*
- * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
* of the search key, just lookup the root with the highest offset for a
* given objectid.
*
@@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
l = path->nodes[0];
@@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- btrfs_std_error(tree_root->fs_info, err,
+ btrfs_handle_fs_error(tree_root->fs_info, err,
"Failed to start trans to delete "
"orphan item");
break;
@@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid);
btrfs_end_transaction(trans, tree_root);
if (err) {
- btrfs_std_error(tree_root->fs_info, err,
+ btrfs_handle_fs_error(tree_root->fs_info, err,
"Failed to delete root orphan "
"item");
break;
@@ -448,7 +448,7 @@ again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
if (ret) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 39dbdcbf4d134..1d195d2b32c6e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -703,7 +703,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
- index = offset >> PAGE_CACHE_SHIFT;
+ index = offset >> PAGE_SHIFT;
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
@@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
* sure we read the bad mirror.
*/
ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
- EXTENT_DAMAGED, GFP_NOFS);
+ EXTENT_DAMAGED);
if (ret) {
/* set_extent_bits should give proper error */
WARN_ON(ret > 0);
@@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
end, EXTENT_DAMAGED, 0, NULL);
if (!corrected)
clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
- EXTENT_DAMAGED, GFP_NOFS);
+ EXTENT_DAMAGED);
}
out:
@@ -1044,7 +1044,7 @@ nodatasum_case:
/*
* !is_metadata and !have_csum, this means that the data
- * might not be COW'ed, that it might be modified
+ * might not be COWed, that it might be modified
* concurrently. The general strategy to work on the
* commit root does not help in the case when COW is not
* used.
@@ -1125,7 +1125,7 @@ nodatasum_case:
* the 2nd page of mirror #1 faces I/O errors, and the 2nd page
* of mirror #2 is readable but the final checksum test fails,
* then the 2nd page of mirror #3 could be tried, whether now
- * the final checksum succeedes. But this would be a rare
+ * the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
@@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
recover->bbio = bbio;
recover->map_length = mapped_length;
- BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
+ BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
@@ -1504,8 +1504,9 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
sblock->no_io_error_seen = 0;
} else {
bio->bi_iter.bi_sector = page->physical >> 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
- if (btrfsic_submit_bio_wait(READ, bio))
+ if (btrfsic_submit_bio_wait(bio))
sblock->no_io_error_seen = 0;
}
@@ -1583,6 +1584,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
bio->bi_bdev = page_bad->dev->bdev;
bio->bi_iter.bi_sector = page_bad->physical >> 9;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
if (PAGE_SIZE != ret) {
@@ -1590,7 +1592,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- if (btrfsic_submit_bio_wait(WRITE, bio)) {
+ if (btrfsic_submit_bio_wait(bio)) {
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
btrfs_dev_replace_stats_inc(
@@ -1636,7 +1638,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
if (spage->io_error) {
void *mapped_buffer = kmap_atomic(spage->page);
- memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+ memset(mapped_buffer, 0, PAGE_SIZE);
flush_dcache_page(spage->page);
kunmap_atomic(mapped_buffer);
}
@@ -1684,6 +1686,7 @@ again:
bio->bi_end_io = scrub_wr_bio_end_io;
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
sbio->err = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical_for_dev_replace ||
@@ -1731,7 +1734,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
- btrfsic_submit_bio(WRITE, sbio->bio);
+ btrfsic_submit_bio(sbio->bio);
}
static void scrub_wr_bio_end_io(struct bio *bio)
@@ -2041,7 +2044,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
- btrfsic_submit_bio(READ, sbio->bio);
+ btrfsic_submit_bio(sbio->bio);
}
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
@@ -2088,6 +2091,7 @@ again:
bio->bi_end_io = scrub_bio_end_io;
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
sbio->err = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical ||
@@ -2127,6 +2131,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
if (bio->bi_error)
sblock->no_io_error_seen = 0;
+ bio_put(bio);
+
btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
}
@@ -2179,7 +2185,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio;
+ struct btrfs_bio *bbio = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
@@ -2860,7 +2866,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- nsectors = map->stripe_len / root->sectorsize;
+ nsectors = div_u64(map->stripe_len, root->sectorsize);
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
GFP_NOFS);
@@ -2980,6 +2986,7 @@ again:
extent_len);
mapped_length = extent_len;
+ bbio = NULL;
ret = btrfs_map_block(fs_info, READ, extent_logical,
&mapped_length, &bbio, 0);
if (!ret) {
@@ -3070,7 +3077,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int slot;
u64 nstripes;
struct extent_buffer *l;
- struct btrfs_key key;
u64 physical;
u64 logical;
u64 logic_end;
@@ -3079,7 +3085,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int mirror_num;
struct reada_control *reada1;
struct reada_control *reada2;
- struct btrfs_key key_start;
+ struct btrfs_key key;
struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
@@ -3158,21 +3164,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info);
/* FIXME it might be better to start readahead at commit root */
- key_start.objectid = logical;
- key_start.type = BTRFS_EXTENT_ITEM_KEY;
- key_start.offset = (u64)0;
+ key.objectid = logical;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)0;
key_end.objectid = logic_end;
key_end.type = BTRFS_METADATA_ITEM_KEY;
key_end.offset = (u64)-1;
- reada1 = btrfs_reada_add(root, &key_start, &key_end);
+ reada1 = btrfs_reada_add(root, &key, &key_end);
- key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key_start.type = BTRFS_EXTENT_CSUM_KEY;
- key_start.offset = logical;
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
key_end.offset = logic_end;
- reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+ reada2 = btrfs_reada_add(csum_root, &key, &key_end);
if (!IS_ERR(reada1))
btrfs_reada_wait(reada1);
@@ -3580,6 +3586,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(root, cache);
+ if (!ret && is_dev_replace) {
+ /*
+ * If we are doing a device replace wait for any tasks
+ * that started delalloc right before we set the block
+ * group to RO mode, as they might have just allocated
+ * an extent from it or decided they could do a nocow
+ * write. And if any such tasks did that, wait for their
+ * ordered extents to complete and then commit the
+ * current transaction, so that we can later see the new
+ * extent items in the extent tree - the ordered extents
+ * create delayed data references (for cow writes) when
+ * they complete, which will be run and insert the
+ * corresponding extent items into the extent tree when
+ * we commit the transaction they used when running
+ * inode.c:btrfs_finish_ordered_io(). We later use
+ * the commit root of the extent tree to find extents
+ * to copy from the srcdev into the tgtdev, and we don't
+ * want to miss any new extents.
+ */
+ btrfs_wait_block_group_reservations(cache);
+ btrfs_wait_nocow_writers(cache);
+ ret = btrfs_wait_ordered_roots(fs_info, -1,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret > 0) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ ret = PTR_ERR(trans);
+ else
+ ret = btrfs_commit_transaction(trans,
+ root);
+ if (ret) {
+ scrub_pause_off(fs_info);
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+ }
scrub_pause_off(fs_info);
if (ret == 0) {
@@ -3600,9 +3646,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
found_key.offset, cache, is_dev_replace);
@@ -3638,6 +3686,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
+ dev_replace->cursor_left = dev_replace->cursor_right;
+ dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
+
if (ro_set)
btrfs_dec_block_group_ro(root, cache);
@@ -3675,9 +3728,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ret = -ENOMEM;
break;
}
-
- dev_replace->cursor_left = dev_replace->cursor_right;
- dev_replace->item_needs_writeback = 1;
skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
@@ -3735,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
fs_info->scrub_workers =
- btrfs_alloc_workqueue("scrub", flags,
+ btrfs_alloc_workqueue(fs_info, "scrub", flags,
1, 4);
else
fs_info->scrub_workers =
- btrfs_alloc_workqueue("scrub", flags,
+ btrfs_alloc_workqueue(fs_info, "scrub", flags,
max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue("scrubwrc", flags,
+ btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
fs_info->scrub_nocow_workers =
- btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
if (!fs_info->scrub_nocow_workers)
goto fail_scrub_nocow_workers;
fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue("scrubparity", flags,
+ btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
@@ -3810,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
/* not supported for data w/o checksums */
- btrfs_err(fs_info,
+ btrfs_err_rl(fs_info,
"scrub: size assumption sectorsize != PAGE_SIZE "
"(%d != %lu) fails",
fs_info->chunk_root->sectorsize, PAGE_SIZE);
@@ -4294,8 +4344,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
goto out;
}
- while (len >= PAGE_CACHE_SIZE) {
- index = offset >> PAGE_CACHE_SHIFT;
+ while (len >= PAGE_SIZE) {
+ index = offset >> PAGE_SHIFT;
again:
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
@@ -4326,7 +4376,7 @@ again:
*/
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
if (!PageUptodate(page)) {
@@ -4348,15 +4398,15 @@ again:
ret = err;
next_page:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (ret)
break;
- offset += PAGE_CACHE_SIZE;
- physical_for_dev_replace += PAGE_CACHE_SIZE;
- nocow_ctx_logical += PAGE_CACHE_SIZE;
- len -= PAGE_CACHE_SIZE;
+ offset += PAGE_SIZE;
+ physical_for_dev_replace += PAGE_SIZE;
+ nocow_ctx_logical += PAGE_SIZE;
+ len -= PAGE_SIZE;
}
ret = COPY_COMPLETE;
out:
@@ -4390,15 +4440,16 @@ static int write_page_nocow(struct scrub_ctx *sctx,
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio->bi_bdev = dev->bdev;
- ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
- if (ret != PAGE_CACHE_SIZE) {
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+ ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
leave_with_eio:
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
- if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
+ if (btrfsic_submit_bio_wait(bio))
goto leave_with_eio;
bio_put(bio);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 19b7bf4284ee9..b71dd298385c1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1831,7 +1831,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
/*
* If we have a parent root we need to verify that the parent dir was
- * not delted and then re-created, if it was then we have no overwrite
+ * not deleted and then re-created, if it was then we have no overwrite
* and we can just unlink this entry.
*/
if (sctx->parent_root) {
@@ -4192,9 +4192,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
return -ENOMEM;
/*
- * This hack is needed because empty acl's are stored as zero byte
+ * This hack is needed because empty acls are stored as zero byte
* data in xattrs. Problem with that is, that receiving these zero byte
- * acl's will fail later. To fix this, we send a dummy acl list that
+ * acls will fail later. To fix this, we send a dummy acl list that
* only contains the version number and no entries.
*/
if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
@@ -4449,9 +4449,9 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
struct page *page;
char *addr;
struct btrfs_key key;
- pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
- unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+ unsigned pg_offset = offset & ~PAGE_MASK;
ssize_t ret = 0;
key.objectid = sctx->cur_ino;
@@ -4471,7 +4471,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
if (len == 0)
goto out;
- last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
@@ -4481,7 +4481,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
- PAGE_CACHE_SIZE - pg_offset);
+ PAGE_SIZE - pg_offset);
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
@@ -4493,7 +4493,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ret = -EIO;
break;
}
@@ -4503,7 +4503,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
kunmap(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
@@ -4804,7 +4804,7 @@ static int clone_range(struct send_ctx *sctx,
type = btrfs_file_extent_type(leaf, ei);
if (type == BTRFS_FILE_EXTENT_INLINE) {
ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
- ext_len = PAGE_CACHE_ALIGN(ext_len);
+ ext_len = PAGE_ALIGN(ext_len);
} else {
ext_len = btrfs_file_extent_num_bytes(leaf, ei);
}
@@ -4886,7 +4886,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
* but there may be items after this page. Make
* sure to send the whole thing
*/
- len = PAGE_CACHE_ALIGN(len);
+ len = PAGE_ALIGN(len);
} else {
len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
}
@@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
+ unsigned alloc_size;
int sort_clone_roots = 0;
int index;
@@ -5978,6 +5979,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
+ if (arg->clone_sources_count >
+ ULLONG_MAX / sizeof(*arg->clone_sources)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (!access_ok(VERIFY_READ, arg->clone_sources,
sizeof(*arg->clone_sources) *
arg->clone_sources_count)) {
@@ -6022,40 +6029,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = vmalloc(sctx->send_max_size);
+ sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
if (!sctx->send_buf) {
- ret = -ENOMEM;
- goto out;
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
- sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+ sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
+ sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+ if (!sctx->read_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
- (arg->clone_sources_count + 1));
+ alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
+
+ sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
if (!sctx->clone_roots) {
- ret = -ENOMEM;
- goto out;
+ sctx->clone_roots = vzalloc(alloc_size);
+ if (!sctx->clone_roots) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
+ alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+
if (arg->clone_sources_count) {
- clone_sources_tmp = vmalloc(arg->clone_sources_count *
- sizeof(*arg->clone_sources));
+ clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
if (!clone_sources_tmp) {
- ret = -ENOMEM;
- goto out;
+ clone_sources_tmp = vmalloc(alloc_size);
+ if (!clone_sources_tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
- arg->clone_sources_count *
- sizeof(*arg->clone_sources));
+ alloc_size);
if (ret) {
ret = -EFAULT;
goto out;
@@ -6089,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->clone_roots[i].root = clone_root;
clone_sources_to_rollback = i + 1;
}
- vfree(clone_sources_tmp);
+ kvfree(clone_sources_tmp);
clone_sources_tmp = NULL;
}
@@ -6207,15 +6227,15 @@ out:
btrfs_root_dec_send_in_progress(sctx->parent_root);
kfree(arg);
- vfree(clone_sources_tmp);
+ kvfree(clone_sources_tmp);
if (sctx) {
if (sctx->send_filp)
fput(sctx->send_filp);
- vfree(sctx->clone_roots);
- vfree(sctx->send_buf);
- vfree(sctx->read_buf);
+ kvfree(sctx->clone_roots);
+ kvfree(sctx->send_buf);
+ kvfree(sctx->read_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index b976597b07216..875c757e73e2a 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p)
*
* The end result is that anyone who #includes ctree.h gets a
* declaration for the btrfs_set_foo functions and btrfs_foo functions,
- * which are wappers of btrfs_set_token_#bits functions and
+ * which are wrappers of btrfs_set_token_#bits functions and
* btrfs_get_token_#bits functions, which are defined in this file.
*
* These setget functions do all the extent_buffer related mapping
@@ -66,7 +66,7 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
\
if (token && token->kaddr && token->offset <= offset && \
token->eb == eb && \
- (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ (token->offset + PAGE_SIZE >= offset + size)) { \
kaddr = token->kaddr; \
p = kaddr + part_offset - token->offset; \
res = get_unaligned_le##bits(p + off); \
@@ -104,7 +104,7 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \
\
if (token && token->kaddr && token->offset <= offset && \
token->eb == eb && \
- (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ (token->offset + PAGE_SIZE >= offset + size)) { \
kaddr = token->kaddr; \
p = kaddr + part_offset - token->offset; \
put_unaligned_le##bits(val, p + off); \
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 00b8f37cc306d..864ce334f696c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno)
return errstr;
}
-static void save_error_info(struct btrfs_fs_info *fs_info)
-{
- /*
- * today we only save the error info into ram. Long term we'll
- * also send it down to the disk
- */
- set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
-}
-
/* btrfs handle error by forcing the filesystem readonly */
static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
{
@@ -121,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* Note that a running device replace operation is not
* canceled here although there is no way to update
* the progress. It would add the risk of a deadlock,
- * therefore the canceling is ommited. The only penalty
+ * therefore the canceling is omitted. The only penalty
* is that some I/O remains active until the procedure
* completes. The next time when the filesystem is
* mounted writeable again, the device replace
@@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
}
/*
- * __btrfs_std_error decodes expected errors from the caller and
+ * __btrfs_handle_fs_error decodes expected errors from the caller and
* invokes the appropriate error response.
*/
__cold
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
@@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
}
#endif
+ /*
+ * Today we only save the error info to memory. Long term we'll
+ * also send it down to the disk
+ */
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
/* Don't go through full error handling during mount */
- save_error_info(fs_info);
if (sb->s_flags & MS_BORN)
btrfs_handle_error(fs_info);
}
@@ -188,6 +184,22 @@ static const char * const logtypes[] = {
"debug",
};
+
+/*
+ * Use one ratelimit state per log level so that a flood of less important
+ * messages doesn't cause more important ones to be dropped.
+ */
+static struct ratelimit_state printk_limits[] = {
+ RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
+};
+
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
@@ -196,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_list args;
const char *type = logtypes[4];
int kern_level;
+ struct ratelimit_state *ratelimit;
va_start(args, fmt);
@@ -206,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
lvl[size] = '\0';
fmt += size;
type = logtypes[kern_level - '0'];
- } else
+ ratelimit = &printk_limits[kern_level - '0'];
+ } else {
*lvl = '\0';
+ /* Default to debug output */
+ ratelimit = &printk_limits[7];
+ }
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+ if (__ratelimit(ratelimit))
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
va_end(args);
}
@@ -233,26 +251,28 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
*/
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *function,
+ const char *function,
unsigned int line, int errno)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
- if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
+ if (!trans->dirty && list_empty(&trans->new_bgs)) {
const char *errstr;
errstr = btrfs_decode_error(errno);
- btrfs_warn(root->fs_info,
+ btrfs_warn(fs_info,
"%s:%d: Aborting unused transaction(%s).",
function, line, errstr);
return;
}
ACCESS_ONCE(trans->transaction->aborted) = errno;
/* Wake up anybody who may be waiting on this transaction */
- wake_up(&root->fs_info->transaction_wait);
- wake_up(&root->fs_info->transaction_blocked_wait);
- __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+ wake_up(&fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
}
/*
* __btrfs_panic decodes unexpected, fatal errors from the caller,
@@ -436,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
*/
break;
case Opt_nodatasum:
- btrfs_set_and_info(root, NODATASUM,
+ btrfs_set_and_info(info, NODATASUM,
"setting nodatasum");
break;
case Opt_datasum:
- if (btrfs_test_opt(root, NODATASUM)) {
- if (btrfs_test_opt(root, NODATACOW))
+ if (btrfs_test_opt(info, NODATASUM)) {
+ if (btrfs_test_opt(info, NODATACOW))
btrfs_info(root->fs_info, "setting datasum, datacow enabled");
else
btrfs_info(root->fs_info, "setting datasum");
@@ -450,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_clear_opt(info->mount_opt, NODATASUM);
break;
case Opt_nodatacow:
- if (!btrfs_test_opt(root, NODATACOW)) {
- if (!btrfs_test_opt(root, COMPRESS) ||
- !btrfs_test_opt(root, FORCE_COMPRESS)) {
+ if (!btrfs_test_opt(info, NODATACOW)) {
+ if (!btrfs_test_opt(info, COMPRESS) ||
+ !btrfs_test_opt(info, FORCE_COMPRESS)) {
btrfs_info(root->fs_info,
"setting nodatacow, compression disabled");
} else {
@@ -465,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
case Opt_datacow:
- btrfs_clear_and_info(root, NODATACOW,
+ btrfs_clear_and_info(info, NODATACOW,
"setting datacow");
break;
case Opt_compress_force:
@@ -474,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
- saved_compress_type = btrfs_test_opt(root, COMPRESS) ?
+ saved_compress_type = btrfs_test_opt(info,
+ COMPRESS) ?
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
- btrfs_test_opt(root, FORCE_COMPRESS);
+ btrfs_test_opt(info, FORCE_COMPRESS);
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -517,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
- if ((btrfs_test_opt(root, COMPRESS) &&
+ if ((btrfs_test_opt(info, COMPRESS) &&
(info->compress_type != saved_compress_type ||
compress_force != saved_compress_force)) ||
- (!btrfs_test_opt(root, COMPRESS) &&
+ (!btrfs_test_opt(info, COMPRESS) &&
no_compress == 1)) {
btrfs_info(root->fs_info,
"%s %s compression",
@@ -530,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
compress_force = false;
break;
case Opt_ssd:
- btrfs_set_and_info(root, SSD,
+ btrfs_set_and_info(info, SSD,
"use ssd allocation scheme");
break;
case Opt_ssd_spread:
- btrfs_set_and_info(root, SSD_SPREAD,
+ btrfs_set_and_info(info, SSD_SPREAD,
"use spread ssd allocation scheme");
btrfs_set_opt(info->mount_opt, SSD);
break;
case Opt_nossd:
- btrfs_set_and_info(root, NOSSD,
+ btrfs_set_and_info(info, NOSSD,
"not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
break;
case Opt_barrier:
- btrfs_clear_and_info(root, NOBARRIER,
+ btrfs_clear_and_info(info, NOBARRIER,
"turning on barriers");
break;
case Opt_nobarrier:
- btrfs_set_and_info(root, NOBARRIER,
+ btrfs_set_and_info(info, NOBARRIER,
"turning off barriers");
break;
case Opt_thread_pool:
@@ -608,24 +629,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
root->fs_info->sb->s_flags &= ~MS_POSIXACL;
break;
case Opt_notreelog:
- btrfs_set_and_info(root, NOTREELOG,
+ btrfs_set_and_info(info, NOTREELOG,
"disabling tree log");
break;
case Opt_treelog:
- btrfs_clear_and_info(root, NOTREELOG,
+ btrfs_clear_and_info(info, NOTREELOG,
"enabling tree log");
break;
case Opt_norecovery:
case Opt_nologreplay:
- btrfs_set_and_info(root, NOLOGREPLAY,
+ btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
case Opt_flushoncommit:
- btrfs_set_and_info(root, FLUSHONCOMMIT,
+ btrfs_set_and_info(info, FLUSHONCOMMIT,
"turning on flush-on-commit");
break;
case Opt_noflushoncommit:
- btrfs_clear_and_info(root, FLUSHONCOMMIT,
+ btrfs_clear_and_info(info, FLUSHONCOMMIT,
"turning off flush-on-commit");
break;
case Opt_ratio:
@@ -642,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
}
break;
case Opt_discard:
- btrfs_set_and_info(root, DISCARD,
+ btrfs_set_and_info(info, DISCARD,
"turning on discard");
break;
case Opt_nodiscard:
- btrfs_clear_and_info(root, DISCARD,
+ btrfs_clear_and_info(info, DISCARD,
"turning off discard");
break;
case Opt_space_cache:
@@ -655,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
strcmp(args[0].from, "v1") == 0) {
btrfs_clear_opt(root->fs_info->mount_opt,
FREE_SPACE_TREE);
- btrfs_set_and_info(root, SPACE_CACHE,
+ btrfs_set_and_info(info, SPACE_CACHE,
"enabling disk space caching");
} else if (strcmp(args[0].from, "v2") == 0) {
btrfs_clear_opt(root->fs_info->mount_opt,
SPACE_CACHE);
- btrfs_set_and_info(root, FREE_SPACE_TREE,
+ btrfs_set_and_info(info,
+ FREE_SPACE_TREE,
"enabling free space tree");
} else {
ret = -EINVAL;
@@ -671,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- if (btrfs_test_opt(root, SPACE_CACHE)) {
- btrfs_clear_and_info(root, SPACE_CACHE,
+ if (btrfs_test_opt(info, SPACE_CACHE)) {
+ btrfs_clear_and_info(info,
+ SPACE_CACHE,
"disabling disk space caching");
}
- if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(root, FREE_SPACE_TREE,
+ if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(info,
+ FREE_SPACE_TREE,
"disabling free space tree");
}
break;
@@ -689,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
"disabling inode map caching");
break;
case Opt_clear_cache:
- btrfs_set_and_info(root, CLEAR_CACHE,
+ btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
case Opt_user_subvol_rm_allowed:
@@ -702,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
break;
case Opt_defrag:
- btrfs_set_and_info(root, AUTO_DEFRAG,
+ btrfs_set_and_info(info, AUTO_DEFRAG,
"enabling auto defrag");
break;
case Opt_nodefrag:
- btrfs_clear_and_info(root, AUTO_DEFRAG,
+ btrfs_clear_and_info(info, AUTO_DEFRAG,
"disabling auto defrag");
break;
case Opt_recovery:
@@ -814,22 +838,22 @@ check:
/*
* Extra check for current option against current flag
*/
- if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
+ if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
btrfs_err(root->fs_info,
"nologreplay must be used with ro mount option");
ret = -EINVAL;
}
out:
if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(root, FREE_SPACE_TREE) &&
- !btrfs_test_opt(root, CLEAR_CACHE)) {
+ !btrfs_test_opt(info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(info, CLEAR_CACHE)) {
btrfs_err(root->fs_info, "cannot disable free space tree");
ret = -EINVAL;
}
- if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+ if (!ret && btrfs_test_opt(info, SPACE_CACHE))
btrfs_info(root->fs_info, "disk space caching is enabled");
- if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+ if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
btrfs_info(root->fs_info, "using free space tree");
kfree(orig);
return ret;
@@ -1153,14 +1177,14 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- trace_btrfs_sync_fs(wait);
+ trace_btrfs_sync_fs(fs_info, wait);
if (!wait) {
filemap_flush(fs_info->btree_inode->i_mapping);
return 0;
}
- btrfs_wait_ordered_roots(fs_info, -1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1196,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
struct btrfs_root *root = info->tree_root;
char *compress_type;
- if (btrfs_test_opt(root, DEGRADED))
+ if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
- if (btrfs_test_opt(root, NODATASUM))
+ if (btrfs_test_opt(info, NODATASUM))
seq_puts(seq, ",nodatasum");
- if (btrfs_test_opt(root, NODATACOW))
+ if (btrfs_test_opt(info, NODATACOW))
seq_puts(seq, ",nodatacow");
- if (btrfs_test_opt(root, NOBARRIER))
+ if (btrfs_test_opt(info, NOBARRIER))
seq_puts(seq, ",nobarrier");
if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
@@ -1211,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
- if (btrfs_test_opt(root, COMPRESS)) {
+ if (btrfs_test_opt(info, COMPRESS)) {
if (info->compress_type == BTRFS_COMPRESS_ZLIB)
compress_type = "zlib";
else
compress_type = "lzo";
- if (btrfs_test_opt(root, FORCE_COMPRESS))
+ if (btrfs_test_opt(info, FORCE_COMPRESS))
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
}
- if (btrfs_test_opt(root, NOSSD))
+ if (btrfs_test_opt(info, NOSSD))
seq_puts(seq, ",nossd");
- if (btrfs_test_opt(root, SSD_SPREAD))
+ if (btrfs_test_opt(info, SSD_SPREAD))
seq_puts(seq, ",ssd_spread");
- else if (btrfs_test_opt(root, SSD))
+ else if (btrfs_test_opt(info, SSD))
seq_puts(seq, ",ssd");
- if (btrfs_test_opt(root, NOTREELOG))
+ if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
- if (btrfs_test_opt(root, NOLOGREPLAY))
+ if (btrfs_test_opt(info, NOLOGREPLAY))
seq_puts(seq, ",nologreplay");
- if (btrfs_test_opt(root, FLUSHONCOMMIT))
+ if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
- if (btrfs_test_opt(root, DISCARD))
+ if (btrfs_test_opt(info, DISCARD))
seq_puts(seq, ",discard");
if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
seq_puts(seq, ",noacl");
- if (btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(info, SPACE_CACHE))
seq_puts(seq, ",space_cache");
- else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+ else if (btrfs_test_opt(info, FREE_SPACE_TREE))
seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
- if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+ if (btrfs_test_opt(info, RESCAN_UUID_TREE))
seq_puts(seq, ",rescan_uuid_tree");
- if (btrfs_test_opt(root, CLEAR_CACHE))
+ if (btrfs_test_opt(info, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
- if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
seq_puts(seq, ",user_subvol_rm_allowed");
- if (btrfs_test_opt(root, ENOSPC_DEBUG))
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
seq_puts(seq, ",enospc_debug");
- if (btrfs_test_opt(root, AUTO_DEFRAG))
+ if (btrfs_test_opt(info, AUTO_DEFRAG))
seq_puts(seq, ",autodefrag");
- if (btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (btrfs_test_opt(info, INODE_MAP_CACHE))
seq_puts(seq, ",inode_cache");
- if (btrfs_test_opt(root, SKIP_BALANCE))
+ if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+ if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
seq_puts(seq, ",check_int_data");
- else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ else if (btrfs_test_opt(info, CHECK_INTEGRITY))
seq_puts(seq, ",check_int");
if (info->check_integrity_print_mask)
seq_printf(seq, ",check_int_print_mask=%d",
@@ -1269,14 +1293,14 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%d",
info->metadata_ratio);
- if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+ if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_test_opt(root, FRAGMENT_DATA))
+ if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
- if (btrfs_test_opt(root, FRAGMENT_METADATA))
+ if (btrfs_test_opt(info, FRAGMENT_METADATA))
seq_puts(seq, ",fragment=metadata");
#endif
seq_printf(seq, ",subvolid=%llu",
@@ -1488,10 +1512,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
} else {
/*
- * Since SELinux(the only one supports security_mnt_opts) does
- * NOT support changing context during remount/mount same sb,
- * This must be the same or part of the same security options,
- * just free it.
+ * Since SELinux (the only one supporting security_mnt_opts)
+ * does NOT support changing context during remount/mount of
+ * the same sb, this must be the same or part of the same
+ * security options, just free it.
*/
security_free_mnt_opts(sec_opts);
}
@@ -1669,8 +1693,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
unsigned long old_opts)
{
/*
- * We need cleanup all defragable inodes if the autodefragment is
- * close or the fs is R/O.
+ * We need to clean up all defragable inodes if autodefrag is turned
+ * off or the filesystem is read only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
@@ -1811,6 +1835,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
}
sb->s_flags &= ~MS_RDONLY;
+
+ fs_info->open = 1;
}
out:
wake_up_process(fs_info->transaction_kthread);
@@ -1881,7 +1907,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
int ret;
/*
- * We aren't under the device list lock, so this is racey-ish, but good
+ * We aren't under the device list lock, so this is racy-ish, but good
* enough for our purposes.
*/
nr_devices = fs_info->fs_devices->open_devices;
@@ -1900,7 +1926,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
if (!devices_info)
return -ENOMEM;
- /* calc min stripe number for data space alloction */
+ /* calc min stripe number for data space allocation */
type = btrfs_get_alloc_profile(root, 1);
if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
@@ -1936,7 +1962,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
avail_space *= BTRFS_STRIPE_LEN;
/*
- * In order to avoid overwritting the superblock on the drive,
+ * In order to avoid overwriting the superblock on the drive,
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
@@ -2032,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* chunk).
*
* If metadata is exhausted, f_bavail will be 0.
- *
- * FIXME: not accurate for mixed block groups, total and free/used are ok,
- * available appears slightly larger.
*/
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
@@ -2051,9 +2074,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
u64 thresh = 0;
+ int mixed = 0;
/*
- * holding chunk_muext to avoid allocating new chunks, holding
+ * holding chunk_mutex to avoid allocating new chunks, holding
* device_list_mutex to avoid the device being removed
*/
rcu_read_lock();
@@ -2076,8 +2100,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
}
}
- if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
- total_free_meta += found->disk_total - found->disk_used;
+
+ /*
+ * Metadata in mixed block group profiles are accounted in data
+ */
+ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+ mixed = 1;
+ else
+ total_free_meta += found->disk_total -
+ found->disk_used;
+ }
total_used += found->disk_used;
}
@@ -2090,7 +2123,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/* Account global block reserve as used, it's in logical size already */
spin_lock(&block_rsv->lock);
- buf->f_bfree -= block_rsv->size >> bits;
+ /* Mixed block groups accounting is not byte-accurate, avoid overflow */
+ if (buf->f_bfree >= block_rsv->size >> bits)
+ buf->f_bfree -= block_rsv->size >> bits;
+ else
+ buf->f_bfree = 0;
spin_unlock(&block_rsv->lock);
buf->f_bavail = div_u64(total_free_data, factor);
@@ -2115,7 +2152,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
thresh = 4 * 1024 * 1024;
- if (total_free_meta - thresh < block_rsv->size)
+ if (!mixed && total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
@@ -2293,7 +2330,7 @@ static void btrfs_interface_exit(void)
static void btrfs_print_mod_info(void)
{
- printk(KERN_INFO "Btrfs loaded"
+ printk(KERN_INFO "Btrfs loaded, crc32c=%s"
#ifdef CONFIG_BTRFS_DEBUG
", debug=on"
#endif
@@ -2303,36 +2340,8 @@ static void btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
", integrity-checker=on"
#endif
- "\n");
-}
-
-static int btrfs_run_sanity_tests(void)
-{
- int ret;
-
- ret = btrfs_init_test_fs();
- if (ret)
- return ret;
-
- ret = btrfs_test_free_space_cache();
- if (ret)
- goto out;
- ret = btrfs_test_extent_buffer_operations();
- if (ret)
- goto out;
- ret = btrfs_test_extent_io();
- if (ret)
- goto out;
- ret = btrfs_test_inodes();
- if (ret)
- goto out;
- ret = btrfs_test_qgroups();
- if (ret)
- goto out;
- ret = btrfs_test_free_space_tree();
-out:
- btrfs_destroy_test_fs();
- return ret;
+ "\n",
+ btrfs_crc32c_impl());
}
static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 539e7b5e3f86a..c6569905d3d1c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
if (!fs_info)
return -EPERM;
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
ret = kstrtoul(skip_spaces(buf), 0, &val);
if (ret)
return ret;
@@ -323,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used);
SPACE_INFO_ATTR(bytes_pinned);
SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
@@ -334,6 +338,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(bytes_pinned),
BTRFS_ATTR_PTR(bytes_reserved),
BTRFS_ATTR_PTR(bytes_may_use),
+ BTRFS_ATTR_PTR(bytes_readonly),
BTRFS_ATTR_PTR(disk_used),
BTRFS_ATTR_PTR(disk_total),
BTRFS_ATTR_PTR(total_bytes_pinned),
@@ -364,7 +369,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
char *label = fs_info->super_copy->label;
- return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ssize_t ret;
+
+ spin_lock(&fs_info->super_lock);
+ ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
}
static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,6 +385,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
size_t p_len;
+ if (!fs_info)
+ return -EPERM;
+
if (fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index f54bf450bad3e..bf62ad919a95d 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void)
return new_inode(test_mnt->mnt_sb);
}
-int btrfs_init_test_fs(void)
+static int btrfs_init_test_fs(void)
{
int ret;
@@ -68,12 +68,12 @@ int btrfs_init_test_fs(void)
if (IS_ERR(test_mnt)) {
printk(KERN_ERR "btrfs: cannot mount test file system\n");
unregister_filesystem(&test_type);
- return ret;
+ return PTR_ERR(test_mnt);
}
return 0;
}
-void btrfs_destroy_test_fs(void)
+static void btrfs_destroy_test_fs(void)
{
kern_unmount(test_mnt);
unregister_filesystem(&test_type);
@@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
extent_io_tree_init(&fs_info->freed_extents[0], NULL);
extent_io_tree_init(&fs_info->freed_extents[1], NULL);
fs_info->pinned_extents = &fs_info->freed_extents[0];
+ set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ test_mnt->mnt_sb->s_fs_info = fs_info;
+
return fs_info;
}
-static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
struct radix_tree_iter iter;
void **slot;
+ if (!fs_info)
+ return;
+
+ if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
+ &fs_info->fs_state)))
+ return;
+
+ test_mnt->mnt_sb->s_fs_info = NULL;
+
spin_lock(&fs_info->buffer_lock);
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
@@ -167,15 +180,16 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
{
if (!root)
return;
+ /* Will be freed by btrfs_free_fs_roots */
+ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
+ return;
if (root->node)
free_extent_buffer(root->node);
- if (root->fs_info)
- btrfs_free_dummy_fs_info(root->fs_info);
kfree(root);
}
struct btrfs_block_group_cache *
-btrfs_alloc_dummy_block_group(unsigned long length)
+btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize)
{
struct btrfs_block_group_cache *cache;
@@ -192,8 +206,8 @@ btrfs_alloc_dummy_block_group(unsigned long length)
cache->key.objectid = 0;
cache->key.offset = length;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = 4096;
- cache->full_stripe_len = 4096;
+ cache->sectorsize = sectorsize;
+ cache->full_stripe_len = sectorsize;
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
INIT_LIST_HEAD(&trans->qgroup_ref_list);
trans->type = __TRANS_DUMMY;
}
+
+int btrfs_run_sanity_tests(void)
+{
+ int ret, i;
+ u32 sectorsize, nodesize;
+ u32 test_sectorsize[] = {
+ PAGE_SIZE,
+ };
+ ret = btrfs_init_test_fs();
+ if (ret)
+ return ret;
+ for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
+ sectorsize = test_sectorsize[i];
+ for (nodesize = sectorsize;
+ nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
+ nodesize <<= 1) {
+ pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n",
+ sectorsize, nodesize);
+ ret = btrfs_test_free_space_cache(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_buffer_operations(sectorsize,
+ nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_io(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_inodes(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ btrfs_destroy_test_fs();
+ return ret;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 054b8c73c951e..b17ffbe8f9f33 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -20,56 +20,29 @@
#define __BTRFS_TESTS
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_run_sanity_tests(void);
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
struct btrfs_root;
struct btrfs_trans_handle;
-int btrfs_test_free_space_cache(void);
-int btrfs_test_extent_buffer_operations(void);
-int btrfs_test_extent_io(void);
-int btrfs_test_inodes(void);
-int btrfs_test_qgroups(void);
-int btrfs_test_free_space_tree(void);
-int btrfs_init_test_fs(void);
-void btrfs_destroy_test_fs(void);
+int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
+int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
+int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
+int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_free_dummy_root(struct btrfs_root *root);
struct btrfs_block_group_cache *
-btrfs_alloc_dummy_block_group(unsigned long length);
+btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize);
void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
-static inline int btrfs_test_free_space_cache(void)
-{
- return 0;
-}
-static inline int btrfs_test_extent_buffer_operations(void)
-{
- return 0;
-}
-static inline int btrfs_init_test_fs(void)
-{
- return 0;
-}
-static inline void btrfs_destroy_test_fs(void)
-{
-}
-static inline int btrfs_test_extent_io(void)
-{
- return 0;
-}
-static inline int btrfs_test_inodes(void)
-{
- return 0;
-}
-static inline int btrfs_test_qgroups(void)
-{
- return 0;
-}
-static inline int btrfs_test_free_space_tree(void)
+static inline int btrfs_run_sanity_tests(void)
{
return 0;
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index f51963a8f929e..1995691746374 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -22,10 +22,11 @@
#include "../extent_io.h"
#include "../disk-io.h"
-static int test_btrfs_split_item(void)
+static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
{
- struct btrfs_path *path;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = NULL;
struct extent_buffer *eb;
struct btrfs_item *item;
char *value = "mary had a little lamb";
@@ -40,20 +41,28 @@ static int test_btrfs_split_item(void)
test_msg("Running btrfs_split_item tests\n");
- root = btrfs_alloc_dummy_root();
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Could not allocate fs_info\n");
+ return -ENOMEM;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Could not allocate root\n");
- return PTR_ERR(root);
+ ret = PTR_ERR(root);
+ goto out;
}
path = btrfs_alloc_path();
if (!path) {
test_msg("Could not allocate path\n");
- kfree(root);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize,
+ nodesize);
if (!eb) {
test_msg("Could not allocate dummy buffer\n");
ret = -ENOMEM;
@@ -218,12 +227,13 @@ static int test_btrfs_split_item(void)
}
out:
btrfs_free_path(path);
- kfree(root);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
-int btrfs_test_extent_buffer_operations(void)
+int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize)
{
- test_msg("Running extent buffer operation tests");
- return test_btrfs_split_item();
+ test_msg("Running extent buffer operation tests\n");
+ return test_btrfs_split_item(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 669b58201e368..d19ab0317283c 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -21,6 +21,7 @@
#include <linux/slab.h>
#include <linux/sizes.h>
#include "btrfs-tests.h"
+#include "../ctree.h"
#include "../extent_io.h"
#define PROCESS_UNLOCK (1 << 0)
@@ -32,8 +33,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
{
int ret;
struct page *pages[16];
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
int count = 0;
@@ -49,9 +50,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
count++;
if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
if (flags & PROCESS_RELEASE)
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
@@ -65,7 +66,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
return count;
}
-static int test_find_delalloc(void)
+static int test_find_delalloc(u32 sectorsize)
{
struct inode *inode;
struct extent_io_tree tmp;
@@ -93,7 +94,7 @@ static int test_find_delalloc(void)
* everything to make sure our pages don't get evicted and screw up our
* test.
*/
- for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+ for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
test_msg("Failed to allocate test page\n");
@@ -104,7 +105,7 @@ static int test_find_delalloc(void)
if (index) {
unlock_page(page);
} else {
- page_cache_get(page);
+ get_page(page);
locked_page = page;
}
}
@@ -113,7 +114,7 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
+ set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL);
start = 0;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -122,14 +123,14 @@ static int test_find_delalloc(void)
test_msg("Should have found at least one delalloc\n");
goto out_bits;
}
- if (start != 0 || end != 4095) {
- test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
- start, end);
+ if (start != 0 || end != (sectorsize - 1)) {
+ test_msg("Expected start 0 end %u, got start %llu end %llu\n",
+ sectorsize - 1, start, end);
goto out_bits;
}
unlock_extent(&tmp, start, end);
unlock_page(locked_page);
- page_cache_release(locked_page);
+ put_page(locked_page);
/*
* Test this scenario
@@ -139,12 +140,12 @@ static int test_find_delalloc(void)
*/
test_start = SZ_64M;
locked_page = find_lock_page(inode->i_mapping,
- test_start >> PAGE_CACHE_SHIFT);
+ test_start >> PAGE_SHIFT);
if (!locked_page) {
test_msg("Couldn't find the locked page\n");
goto out_bits;
}
- set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
+ set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -165,18 +166,18 @@ static int test_find_delalloc(void)
}
unlock_extent(&tmp, start, end);
/* locked_page was unlocked above */
- page_cache_release(locked_page);
+ put_page(locked_page);
/*
* Test this scenario
* |--- delalloc ---|
* |--- search ---|
*/
- test_start = max_bytes + 4096;
+ test_start = max_bytes + sectorsize;
locked_page = find_lock_page(inode->i_mapping, test_start >>
- PAGE_CACHE_SHIFT);
+ PAGE_SHIFT);
if (!locked_page) {
- test_msg("Could'nt find the locked page\n");
+ test_msg("Couldn't find the locked page\n");
goto out_bits;
}
start = test_start;
@@ -199,7 +200,7 @@ static int test_find_delalloc(void)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -225,13 +226,13 @@ static int test_find_delalloc(void)
* range we want to find.
*/
page = find_get_page(inode->i_mapping,
- (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT);
+ (max_bytes + SZ_1M) >> PAGE_SHIFT);
if (!page) {
test_msg("Couldn't find our page\n");
goto out_bits;
}
ClearPageDirty(page);
- page_cache_release(page);
+ put_page(page);
/* We unlocked it in the previous test */
lock_page(locked_page);
@@ -239,7 +240,7 @@ static int test_find_delalloc(void)
end = 0;
/*
* Currently if we fail to find dirty pages in the delalloc range we
- * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If
+ * will adjust max_bytes down to PAGE_SIZE and then re-search. If
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
@@ -249,9 +250,9 @@ static int test_find_delalloc(void)
test_msg("Didn't find our range\n");
goto out_bits;
}
- if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+ if (start != test_start && end != test_start + PAGE_SIZE - 1) {
test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
- test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+ test_start, test_start + PAGE_SIZE - 1, start,
end);
goto out_bits;
}
@@ -262,16 +263,26 @@ static int test_find_delalloc(void)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1);
out:
if (locked_page)
- page_cache_release(locked_page);
+ put_page(locked_page);
process_page_range(inode, 0, total_dirty - 1,
PROCESS_UNLOCK | PROCESS_RELEASE);
iput(inode);
return ret;
}
+/**
+ * test_bit_in_byte - Determine whether a bit is set in a byte
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static inline int test_bit_in_byte(int nr, const u8 *addr)
+{
+ return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1)));
+}
+
static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
unsigned long len)
{
@@ -298,25 +309,29 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
return -EINVAL;
}
- bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
- sizeof(long) * BITS_PER_BYTE);
- extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
- sizeof(long) * BITS_PER_BYTE);
- if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
- test_msg("Setting straddling pages failed\n");
- return -EINVAL;
- }
+ /* Straddling pages test */
+ if (len > PAGE_SIZE) {
+ bitmap_set(bitmap,
+ (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting straddling pages failed\n");
+ return -EINVAL;
+ }
- bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
- bitmap_clear(bitmap,
- (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
- sizeof(long) * BITS_PER_BYTE);
- extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
- extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
- sizeof(long) * BITS_PER_BYTE);
- if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
- test_msg("Clearing straddling pages failed\n");
- return -EINVAL;
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ bitmap_clear(bitmap,
+ (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing straddling pages failed\n");
+ return -EINVAL;
+ }
}
/*
@@ -333,7 +348,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
for (i = 0; i < len * BITS_PER_BYTE; i++) {
int bit, bit1;
- bit = !!test_bit(i, bitmap);
+ bit = !!test_bit_in_byte(i, (u8 *)bitmap);
bit1 = !!extent_buffer_test_bit(eb, 0, i);
if (bit1 != bit) {
test_msg("Testing bit pattern failed\n");
@@ -351,15 +366,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
return 0;
}
-static int test_eb_bitmaps(void)
+static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
- unsigned long len = PAGE_CACHE_SIZE * 4;
+ unsigned long len;
unsigned long *bitmap;
struct extent_buffer *eb;
int ret;
test_msg("Running extent buffer bitmap tests\n");
+ /*
+ * On ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
+ * BTRFS_MAX_METADATA_BLOCKSIZE.
+ */
+ len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE)
+ ? sectorsize * 4 : sectorsize;
+
bitmap = kmalloc(len, GFP_KERNEL);
if (!bitmap) {
test_msg("Couldn't allocate test bitmap\n");
@@ -379,7 +401,7 @@ static int test_eb_bitmaps(void)
/* Do it over again with an extent buffer which isn't page-aligned. */
free_extent_buffer(eb);
- eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len);
+ eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
if (!eb) {
test_msg("Couldn't allocate test extent buffer\n");
kfree(bitmap);
@@ -393,17 +415,17 @@ out:
return ret;
}
-int btrfs_test_extent_io(void)
+int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
int ret;
test_msg("Running extent I/O tests\n");
- ret = test_find_delalloc();
+ ret = test_find_delalloc(sectorsize);
if (ret)
goto out;
- ret = test_eb_bitmaps();
+ ret = test_eb_bitmaps(sectorsize, nodesize);
out:
test_msg("Extent I/O tests finished\n");
return ret;
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c9ad97b1e6909..3221c8dee272f 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -22,10 +22,10 @@
#include "../disk-io.h"
#include "../free-space-cache.h"
-#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
/*
- * This test just does basic sanity checking, making sure we can add an exten
+ * This test just does basic sanity checking, making sure we can add an extent
* entry and remove space from either end and the middle, and make sure we can
* remove space that covers adjacent extent entries.
*/
@@ -99,7 +99,8 @@ static int test_extents(struct btrfs_block_group_cache *cache)
return 0;
}
-static int test_bitmaps(struct btrfs_block_group_cache *cache)
+static int test_bitmaps(struct btrfs_block_group_cache *cache,
+ u32 sectorsize)
{
u64 next_bitmap_offset;
int ret;
@@ -139,7 +140,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
* The first bitmap we have starts at offset 0 so the next one is just
* at the end of the first bitmap.
*/
- next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+ next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
/* Test a bit straddling two bitmaps */
ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
@@ -167,9 +168,10 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
}
/* This is the high grade jackassery */
-static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
+ u32 sectorsize)
{
- u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+ u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
int ret;
test_msg("Running bitmap and extent tests\n");
@@ -396,11 +398,13 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
* wasn't optimal as they could be spread all over the block group while under
* concurrency (extra overhead and fragmentation).
*
- * This stealing approach is benefical, since we always prefer to allocate from
- * extent entries, both for clustered and non-clustered allocation requests.
+ * This stealing approach is beneficial, since we always prefer to allocate
+ * from extent entries, both for clustered and non-clustered allocation
+ * requests.
*/
static int
-test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
+ u32 sectorsize)
{
int ret;
u64 offset;
@@ -538,7 +542,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096);
+ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -596,8 +600,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) {
- test_msg("Cache free space is not 1Mb + 4Kb\n");
+ if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) {
+ test_msg("Cache free space is not 1Mb + %u\n", sectorsize);
return -EINVAL;
}
@@ -610,22 +614,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -EINVAL;
}
- /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
+ /*
+ * All that remains is a sectorsize free space region in a bitmap.
+ * Confirm.
+ */
ret = check_num_extents_and_bitmaps(cache, 1, 1);
if (ret)
return ret;
- if (cache->free_space_ctl->free_space != 4096) {
- test_msg("Cache free space is not 4Kb\n");
+ if (cache->free_space_ctl->free_space != sectorsize) {
+ test_msg("Cache free space is not %u\n", sectorsize);
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 4096, 0,
+ 0, sectorsize, 0,
&max_extent_size);
if (offset != (SZ_128M + SZ_16M)) {
- test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
- offset);
+ test_msg("Failed to allocate %u, returned offset : %llu\n",
+ sectorsize, offset);
return -EINVAL;
}
@@ -732,7 +739,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, SZ_32M, 8192);
+ ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -756,7 +763,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
/*
* Confirm that our extent entry didn't stole all free space from the
- * bitmap, because of the small 8Kb free space region.
+ * bitmap, because of the small 2 * sectorsize free space region.
*/
ret = check_num_extents_and_bitmaps(cache, 2, 1);
if (ret)
@@ -782,8 +789,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) {
- test_msg("Cache free space is not 1Mb + 8Kb\n");
+ if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) {
+ test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize);
return -EINVAL;
}
@@ -795,21 +802,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return -EINVAL;
}
- /* All that remains is a 8Kb free space region in a bitmap. Confirm. */
+ /*
+ * All that remains is a 2 * sectorsize free space region
+ * in a bitmap. Confirm.
+ */
ret = check_num_extents_and_bitmaps(cache, 1, 1);
if (ret)
return ret;
- if (cache->free_space_ctl->free_space != 8192) {
- test_msg("Cache free space is not 8Kb\n");
+ if (cache->free_space_ctl->free_space != 2 * sectorsize) {
+ test_msg("Cache free space is not %u\n", 2 * sectorsize);
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 8192, 0,
+ 0, 2 * sectorsize, 0,
&max_extent_size);
if (offset != SZ_32M) {
- test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
+ test_msg("Failed to allocate %u, offset: %llu\n",
+ 2 * sectorsize,
offset);
return -EINVAL;
}
@@ -824,29 +835,38 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
return 0;
}
-int btrfs_test_free_space_cache(void)
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_block_group_cache *cache;
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
test_msg("Running btrfs free space cache tests\n");
- cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
+ /*
+ * For ppc64 (with 64k page size), bytes per bitmap might be
+ * larger than 1G. To make the bitmap test available on ppc64,
+ * allocate a dummy block group whose size crosses bitmaps.
+ */
+ cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize
+ + PAGE_SIZE, sectorsize);
if (!cache) {
test_msg("Couldn't run the tests\n");
return 0;
}
- root = btrfs_alloc_dummy_root();
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ ret = -ENOMEM;
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info)
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
goto out;
+ }
root->fs_info->extent_root = root;
cache->fs_info = root->fs_info;
@@ -854,17 +874,18 @@ int btrfs_test_free_space_cache(void)
ret = test_extents(cache);
if (ret)
goto out;
- ret = test_bitmaps(cache);
+ ret = test_bitmaps(cache, sectorsize);
if (ret)
goto out;
- ret = test_bitmaps_and_extents(cache);
+ ret = test_bitmaps_and_extents(cache, sectorsize);
if (ret)
goto out;
- ret = test_steal_space_from_bitmap_to_extent(cache);
+ ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize);
out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
test_msg("Free space cache tests finished\n");
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 7cea4462acd5f..7508d3b427804 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../disk-io.h"
@@ -30,7 +31,7 @@ struct free_space_extent {
* The test cases align their operations to this in order to hit some of the
* edge cases in the bitmap code.
*/
-#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096)
+#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE)
static int __check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -439,25 +440,27 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *,
struct btrfs_block_group_cache *,
struct btrfs_path *);
-static int run_test(test_func_t test_func, int bitmaps)
+static int run_test(test_func_t test_func, int bitmaps,
+ u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_root *root = NULL;
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_trans_handle trans;
struct btrfs_path *path = NULL;
int ret;
- root = btrfs_alloc_dummy_root();
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate dummy root\n");
- ret = PTR_ERR(root);
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
- ret = -ENOMEM;
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate dummy root\n");
+ ret = PTR_ERR(root);
goto out;
}
@@ -466,7 +469,8 @@ static int run_test(test_func_t test_func, int bitmaps)
root->fs_info->free_space_root = root;
root->fs_info->tree_root = root;
- root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+ root->node = alloc_test_extent_buffer(root->fs_info,
+ nodesize, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -474,9 +478,9 @@ static int run_test(test_func_t test_func, int bitmaps)
}
btrfs_set_header_level(root->node, 0);
btrfs_set_header_nritems(root->node, 0);
- root->alloc_bytenr += 8192;
+ root->alloc_bytenr += 2 * nodesize;
- cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE);
+ cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize);
if (!cache) {
test_msg("Couldn't allocate dummy block group cache\n");
ret = -ENOMEM;
@@ -531,20 +535,22 @@ out:
btrfs_free_path(path);
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
-static int run_test_both_formats(test_func_t test_func)
+static int run_test_both_formats(test_func_t test_func,
+ u32 sectorsize, u32 nodesize)
{
int ret;
- ret = run_test(test_func, 0);
+ ret = run_test(test_func, 0, sectorsize, nodesize);
if (ret)
return ret;
- return run_test(test_func, 1);
+ return run_test(test_func, 1, sectorsize, nodesize);
}
-int btrfs_test_free_space_tree(void)
+int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
{
test_func_t tests[] = {
test_empty_block_group,
@@ -561,9 +567,11 @@ int btrfs_test_free_space_tree(void)
test_msg("Running free space tree tests\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
- int ret = run_test_both_formats(tests[i]);
+ int ret = run_test_both_formats(tests[i], sectorsize,
+ nodesize);
if (ret) {
- test_msg("%pf failed\n", tests[i]);
+ test_msg("%pf : sectorsize %u failed\n",
+ tests[i], sectorsize);
return ret;
}
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 863a6a3af1f88..9f72aeda92204 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../btrfs_inode.h"
@@ -86,19 +87,19 @@ static void insert_inode_item_key(struct btrfs_root *root)
* diagram of how the extents will look though this may not be possible we still
* want to make sure everything acts normally (the last number is not inclusive)
*
- * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288]
- * [hole ][inline][ hole ][ regular ][regular1 split][ hole ]
+ * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
+ * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split]
*
- * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056]
- * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ]
+ * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ]
+ * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written]
*
- * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ]
- * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent]
+ * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635]
+ * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1]
*
- * [81920-86016]
- * [ regular ]
+ * [69635-73731][ 73731 - 86019 ][86019-90115]
+ * [ regular ][ hole but no extent][ regular ]
*/
-static void setup_file_extents(struct btrfs_root *root)
+static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
{
int slot = 0;
u64 disk_bytenr = SZ_1M;
@@ -119,7 +120,7 @@ static void setup_file_extents(struct btrfs_root *root)
insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
slot);
slot++;
- offset = 4096;
+ offset = sectorsize;
/* Now another hole */
insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
@@ -128,100 +129,108 @@ static void setup_file_extents(struct btrfs_root *root)
offset += 4;
/* Now for a regular extent */
- insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0,
+ disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- disk_bytenr += 4096;
- offset += 4095;
+ disk_bytenr += sectorsize;
+ offset += sectorsize - 1;
/*
* Now for 3 extents that were split from a hole punch so we test
* offsets properly.
*/
- insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
- 0, slot);
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, 4 * sectorsize,
BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 8192;
- disk_bytenr += 16384;
+ offset += 2 * sectorsize;
+ disk_bytenr += 4 * sectorsize;
/* Now for a unwritten prealloc extent */
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
- offset += 4096;
+ offset += sectorsize;
/*
* We want to jack up disk_bytenr a little more so the em stuff doesn't
* merge our records.
*/
- disk_bytenr += 8192;
+ disk_bytenr += 2 * sectorsize;
/*
* Now for a partially written prealloc extent, basically the same as
* the hole punch example above. Ram_bytes never changes when you mark
* extents written btw.
*/
- insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
- BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize,
+ disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, 4 * sectorsize,
BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
slot++;
- offset += 8192;
- disk_bytenr += 16384;
+ offset += 2 * sectorsize;
+ disk_bytenr += 4 * sectorsize;
/* Now a normal compressed extent */
- insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0,
+ disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG,
+ BTRFS_COMPRESS_ZLIB, slot);
slot++;
- offset += 8192;
+ offset += 2 * sectorsize;
/* No merges */
- disk_bytenr += 8192;
+ disk_bytenr += 2 * sectorsize;
/* Now a split compressed extent */
- insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG,
+ BTRFS_COMPRESS_ZLIB, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0,
+ disk_bytenr + sectorsize, sectorsize,
BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 4096;
- insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, sectorsize,
BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
slot++;
- offset += 8192;
- disk_bytenr += 8192;
+ offset += 2 * sectorsize;
+ disk_bytenr += 2 * sectorsize;
/* Now extents that have a hole but no hole extent */
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
- offset += 16384;
- disk_bytenr += 4096;
- insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
- BTRFS_FILE_EXTENT_REG, 0, slot);
+ offset += 4 * sectorsize;
+ disk_bytenr += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
static unsigned long prealloc_only = 0;
static unsigned long compressed_only = 0;
static unsigned long vacancy_only = 0;
-static noinline int test_btrfs_get_extent(void)
+static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
struct extent_map *em = NULL;
@@ -240,23 +249,19 @@ static noinline int test_btrfs_get_extent(void)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- root = btrfs_alloc_dummy_root();
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- /*
- * We do this since btrfs_get_extent wants to assign em->bdev to
- * root->fs_info->fs_devices->latest_bdev.
- */
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
goto out;
}
- root->node = alloc_dummy_extent_buffer(NULL, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -264,7 +269,7 @@ static noinline int test_btrfs_get_extent(void)
/*
* We will just free a dummy node if it's ref count is 2 so we need an
- * extra ref so our searches don't accidently release our page.
+ * extra ref so our searches don't accidentally release our page.
*/
extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
@@ -273,7 +278,7 @@ static noinline int test_btrfs_get_extent(void)
/* First with no extents */
BTRFS_I(inode)->root = root;
- em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0);
if (IS_ERR(em)) {
em = NULL;
test_msg("Got an error when we shouldn't have\n");
@@ -295,7 +300,7 @@ static noinline int test_btrfs_get_extent(void)
* setup_file_extents, so if you change anything there you need to
* update the comment and update the expected values below.
*/
- setup_file_extents(root);
+ setup_file_extents(root, sectorsize);
em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
if (IS_ERR(em)) {
@@ -318,7 +323,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -327,7 +332,8 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected an inline, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4091) {
+
+ if (em->start != offset || em->len != (sectorsize - 5)) {
test_msg("Unexpected extent wanted start %llu len 1, got start "
"%llu len %llu\n", offset, em->start, em->len);
goto out;
@@ -344,7 +350,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -366,7 +372,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Regular extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -375,7 +381,7 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4095) {
+ if (em->start != offset || em->len != sectorsize - 1) {
test_msg("Unexpected extent wanted start %llu len 4095, got "
"start %llu len %llu\n", offset, em->start, em->len);
goto out;
@@ -393,7 +399,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* The next 3 are split extents */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -402,9 +408,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -421,7 +428,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -430,9 +437,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a hole, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -442,7 +450,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -451,9 +459,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -475,7 +484,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Prealloc extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -484,9 +493,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
@@ -503,7 +513,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -512,9 +522,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
@@ -532,7 +543,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -541,9 +552,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -564,7 +576,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -573,9 +585,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
@@ -598,7 +611,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Now for the compressed extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -607,9 +620,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u,"
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
@@ -631,7 +645,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* Split compressed extent */
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -640,9 +654,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u,"
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
@@ -665,7 +680,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -674,9 +689,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -691,7 +707,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -701,9 +717,10 @@ static noinline int test_btrfs_get_extent(void)
disk_bytenr, em->block_start);
goto out;
}
- if (em->start != offset || em->len != 8192) {
- test_msg("Unexpected extent wanted start %llu len 8192, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
@@ -725,7 +742,7 @@ static noinline int test_btrfs_get_extent(void)
free_extent_map(em);
/* A hole between regular extents but no hole extent */
- em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -734,9 +751,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -765,9 +783,10 @@ static noinline int test_btrfs_get_extent(void)
* length of the actual hole, if this changes we'll have to change this
* test.
*/
- if (em->start != offset || em->len != 12288) {
- test_msg("Unexpected extent wanted start %llu len 12288, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != 3 * sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u, "
+ "got start %llu len %llu\n",
+ offset, 3 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
@@ -783,7 +802,7 @@ static noinline int test_btrfs_get_extent(void)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -792,9 +811,10 @@ static noinline int test_btrfs_get_extent(void)
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != offset || em->len != 4096) {
- test_msg("Unexpected extent wanted start %llu len 4096, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ if (em->start != offset || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %llu len %u,"
+ "got start %llu len %llu\n",
+ offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -812,11 +832,13 @@ out:
free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
-static int test_hole_first(void)
+static int test_hole_first(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
struct extent_map *em = NULL;
@@ -832,19 +854,19 @@ static int test_hole_first(void)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- root = btrfs_alloc_dummy_root();
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
goto out;
}
- root->node = alloc_dummy_extent_buffer(NULL, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -861,9 +883,9 @@ static int test_hole_first(void)
* btrfs_get_extent.
*/
insert_inode_item_key(root);
- insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
- BTRFS_FILE_EXTENT_REG, 0, 1);
- em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+ insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
+ em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
@@ -872,9 +894,10 @@ static int test_hole_first(void)
test_msg("Expected a hole, got %llu\n", em->block_start);
goto out;
}
- if (em->start != 0 || em->len != 4096) {
- test_msg("Unexpected extent wanted start 0 len 4096, got start "
- "%llu len %llu\n", em->start, em->len);
+ if (em->start != 0 || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start 0 len %u, "
+ "got start %llu len %llu\n",
+ sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
@@ -884,18 +907,19 @@ static int test_hole_first(void)
}
free_extent_map(em);
- em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+ em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0);
if (IS_ERR(em)) {
test_msg("Got an error when we shouldn't have\n");
goto out;
}
- if (em->block_start != 4096) {
+ if (em->block_start != sectorsize) {
test_msg("Expected a real extent, got %llu\n", em->block_start);
goto out;
}
- if (em->start != 4096 || em->len != 4096) {
- test_msg("Unexpected extent wanted start 4096 len 4096, got "
- "start %llu len %llu\n", em->start, em->len);
+ if (em->start != sectorsize || em->len != sectorsize) {
+ test_msg("Unexpected extent wanted start %u len %u, "
+ "got start %llu len %llu\n",
+ sectorsize, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -909,11 +933,13 @@ out:
free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
-static int test_extent_accounting(void)
+static int test_extent_accounting(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
@@ -924,15 +950,15 @@ static int test_extent_accounting(void)
return ret;
}
- root = btrfs_alloc_dummy_root();
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
goto out;
}
@@ -954,10 +980,11 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE][4k] */
+ /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
- BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+ BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
+ NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -969,10 +996,10 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+ /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DIRTY |
EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
NULL, GFP_KERNEL);
@@ -987,10 +1014,11 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE][4K] */
+ /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ (BTRFS_MAX_EXTENT_SIZE >> 1)
+ + sectorsize - 1,
NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
@@ -1004,16 +1032,17 @@ static int test_extent_accounting(void)
}
/*
- * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+ * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*
* I'm artificially adding 2 to outstanding_extents because in the
* buffered IO case we'd add things up as we go, but I don't feel like
* doing that here, this isn't the interesting case we want to test.
*/
BTRFS_I(inode)->outstanding_extents += 2;
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
- (BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
- NULL);
+ ret = btrfs_set_extent_delalloc(inode,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
+ (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
+ NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1025,10 +1054,13 @@ static int test_extent_accounting(void)
goto out;
}
- /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+ /*
+ * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
+ */
BTRFS_I(inode)->outstanding_extents++;
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
- BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ ret = btrfs_set_extent_delalloc(inode,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1042,8 +1074,8 @@ static int test_extent_accounting(void)
/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE+4096,
- BTRFS_MAX_EXTENT_SIZE+8191,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
NULL, GFP_KERNEL);
@@ -1063,8 +1095,9 @@ static int test_extent_accounting(void)
* might fail and I'd rather satisfy my paranoia at this point.
*/
BTRFS_I(inode)->outstanding_extents++;
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
- BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ ret = btrfs_set_extent_delalloc(inode,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1100,10 +1133,11 @@ out:
NULL, GFP_KERNEL);
iput(inode);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
-int btrfs_test_inodes(void)
+int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
{
int ret;
@@ -1112,13 +1146,13 @@ int btrfs_test_inodes(void)
set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
test_msg("Running btrfs_get_extent tests\n");
- ret = test_btrfs_get_extent();
+ ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
return ret;
test_msg("Running hole first btrfs_get_extent test\n");
- ret = test_hole_first();
+ ret = test_hole_first(sectorsize, nodesize);
if (ret)
return ret;
test_msg("Running outstanding_extents tests\n");
- return test_extent_accounting();
+ return test_extent_accounting(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 8ea5d34bc5a20..4407fef7c16c1 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../transaction.h"
@@ -216,7 +217,8 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
return ret;
}
-static int test_no_shared_qgroup(struct btrfs_root *root)
+static int test_no_shared_qgroup(struct btrfs_root *root,
+ u32 sectorsize, u32 nodesize)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -227,29 +229,30 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
btrfs_init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
- ret = btrfs_create_qgroup(NULL, fs_info, 5);
+ ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
}
/*
- * Since the test trans doesn't havee the complicated delayed refs,
+ * Since the test trans doesn't have the complicated delayed refs,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FS_TREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -257,32 +260,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
old_roots = NULL;
new_roots = NULL;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = remove_extent_item(root, 4096, 4096);
+ ret = remove_extent_item(root, nodesize, nodesize);
if (ret)
return -EINVAL;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -290,14 +294,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return -EINVAL;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
@@ -310,7 +314,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
* right, also remove one of the roots and make sure the exclusive count is
* adjusted properly.
*/
-static int test_multiple_refs(struct btrfs_root *root)
+static int test_multiple_refs(struct btrfs_root *root,
+ u32 sectorsize, u32 nodesize)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -322,25 +327,29 @@ static int test_multiple_refs(struct btrfs_root *root)
test_msg("Qgroup multiple refs test\n");
- /* We have 5 created already from the previous test */
- ret = btrfs_create_qgroup(NULL, fs_info, 256);
+ /*
+ * We have BTRFS_FS_TREE_OBJECTID created already from the
+ * previous test.
+ */
+ ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FS_TREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -348,30 +357,32 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = add_tree_ref(root, 4096, 4096, 0, 256);
+ ret = add_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FIRST_FREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -379,35 +390,38 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
+ nodesize, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots);
if (ret) {
ulist_free(old_roots);
test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+ ret = remove_extent_ref(root, nodesize, nodesize, 0,
+ BTRFS_FIRST_FREE_OBJECTID);
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -415,19 +429,21 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
- old_roots, new_roots);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
+ nodesize, old_roots, new_roots);
if (ret) {
test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
- if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
+ 0, 0)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
- if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
@@ -435,24 +451,26 @@ static int test_multiple_refs(struct btrfs_root *root)
return 0;
}
-int btrfs_test_qgroups(void)
+int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct btrfs_root *root;
struct btrfs_root *tmp_root;
int ret = 0;
- root = btrfs_alloc_dummy_root();
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
- return PTR_ERR(root);
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ return -ENOMEM;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
- ret = -ENOMEM;
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ ret = PTR_ERR(root);
goto out;
}
+
/* We are using this root as our extent root */
root->fs_info->extent_root = root;
@@ -468,7 +486,8 @@ int btrfs_test_qgroups(void)
* Can't use bytenr 0, some things freak out
* *cough*backref walking code*cough*
*/
- root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize,
+ nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -476,16 +495,16 @@ int btrfs_test_qgroups(void)
}
btrfs_set_header_level(root->node, 0);
btrfs_set_header_nritems(root->node, 0);
- root->alloc_bytenr += 8192;
+ root->alloc_bytenr += 2 * nodesize;
- tmp_root = btrfs_alloc_dummy_root();
+ tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
goto out;
}
- tmp_root->root_key.objectid = 5;
+ tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID;
root->fs_info->fs_root = tmp_root;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
@@ -493,14 +512,14 @@ int btrfs_test_qgroups(void)
goto out;
}
- tmp_root = btrfs_alloc_dummy_root();
+ tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
goto out;
}
- tmp_root->root_key.objectid = 256;
+ tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
test_msg("Couldn't insert fs root %d\n", ret);
@@ -508,11 +527,12 @@ int btrfs_test_qgroups(void)
}
test_msg("Running qgroup tests\n");
- ret = test_no_shared_qgroup(root);
+ ret = test_no_shared_qgroup(root, sectorsize, nodesize);
if (ret)
goto out;
- ret = test_multiple_refs(root);
+ ret = test_multiple_refs(root, sectorsize, nodesize);
out:
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 43885e51b8829..9cca0a7219618 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -311,10 +311,11 @@ loop:
* when the transaction commits
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ int force)
{
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
- root->last_trans < trans->transid) {
+ if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ root->last_trans < trans->transid) || force) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
@@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
smp_wmb();
spin_lock(&root->fs_info->fs_roots_radix_lock);
- if (root->last_trans == trans->transid) {
+ if (root->last_trans == trans->transid && !force) {
spin_unlock(&root->fs_info->fs_roots_radix_lock);
return 0;
}
@@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&root->fs_info->reloc_mutex);
- record_root_in_trans(trans, root);
+ record_root_in_trans(trans, root, 0);
mutex_unlock(&root->fs_info->reloc_mutex);
return 0;
@@ -560,6 +561,7 @@ again:
h->transaction = cur_trans;
h->root = root;
h->use_count = 1;
+ h->fs_info = root->fs_info;
h->type = type;
h->can_flush_pending_bgs = true;
@@ -817,6 +819,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
+ u64 transid = trans->transid;
unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
@@ -904,7 +907,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
if (must_run_delayed_refs) {
- btrfs_async_run_delayed_refs(root, cur,
+ btrfs_async_run_delayed_refs(root, cur, transid,
must_run_delayed_refs == 1);
}
return err;
@@ -943,7 +946,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
err = convert_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT,
- mark, &cached_state, GFP_NOFS);
+ mark, &cached_state);
/*
* convert_extent_bit can return -ENOMEM, which is most of the
* time a temporary error. So when it happens, ignore the error
@@ -1311,6 +1314,92 @@ int btrfs_defrag_root(struct btrfs_root *root)
}
/*
+ * Do all special snapshot related qgroup dirty hack.
+ *
+ * Will do all needed qgroup inherit and dirty hack like switch commit
+ * roots inside one transaction and write all btree into disk, to make
+ * qgroup works.
+ */
+static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_root *src,
+ struct btrfs_root *parent,
+ struct btrfs_qgroup_inherit *inherit,
+ u64 dst_objectid)
+{
+ struct btrfs_fs_info *fs_info = src->fs_info;
+ int ret;
+
+ /*
+ * Save some performance in the case that qgroups are not
+ * enabled. If this check races with the ioctl, rescan will
+ * kick in anyway.
+ */
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_enabled) {
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return 0;
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
+ * We are going to commit transaction, see btrfs_commit_transaction()
+ * comment for reason locking tree_log_mutex
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
+ ret = commit_fs_roots(trans, src);
+ if (ret)
+ goto out;
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_account_extents(trans, fs_info);
+ if (ret < 0)
+ goto out;
+
+ /* Now qgroup are all updated, we can inherit it to new qgroups */
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ src->root_key.objectid, dst_objectid,
+ inherit);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now we do a simplified commit transaction, which will:
+ * 1) commit all subvolume and extent tree
+ * To ensure all subvolume and extent tree have a valid
+ * commit_root to accounting later insert_dir_item()
+ * 2) write all btree blocks onto disk
+ * This is to make sure later btree modification will be cowed
+ * Or commit_root can be populated and cause wrong qgroup numbers
+ * In this simplified commit, we don't really care about other trees
+ * like chunk and root tree, as they won't affect qgroup.
+ * And we don't write super to avoid half committed status.
+ */
+ ret = commit_cowonly_roots(trans, src);
+ if (ret)
+ goto out;
+ switch_commit_roots(trans->transaction, fs_info);
+ ret = btrfs_write_and_wait_transaction(trans, src);
+ if (ret)
+ btrfs_handle_fs_error(fs_info, ret,
+ "Error while writing out transaction for qgroup");
+
+out:
+ mutex_unlock(&fs_info->tree_log_mutex);
+
+ /*
+ * Force parent root to be updated, as we recorded it before so its
+ * last_trans == cur_transid.
+ * Or it won't be committed again onto disk after later
+ * insert_dir_item()
+ */
+ if (!ret)
+ record_root_in_trans(trans, parent, 1);
+ return ret;
+}
+
+/*
* new snapshots need to be created at a very specific time in the
* transaction commit. This does the actual creation.
*
@@ -1383,7 +1472,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
- record_root_in_trans(trans, parent_root);
+ record_root_in_trans(trans, parent_root, 0);
cur_time = current_fs_time(parent_inode->i_sb);
@@ -1403,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto dir_item_existed;
} else if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_release_path(path);
@@ -1416,11 +1505,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_run_delayed_items(trans, root);
if (ret) { /* Transaction aborted */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
- record_root_in_trans(trans, root);
+ record_root_in_trans(trans, root, 0);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@@ -1455,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1466,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(old);
free_extent_buffer(old);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
/* see comments in should_cow_block() */
@@ -1480,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1492,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1500,22 +1589,33 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_reloc_post_snapshot(trans, pending);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
+ /*
+ * Do special qgroup accounting for snapshot, as we do some qgroup
+ * snapshot hack to do fast snapshot.
+ * To co-operate with that hack, we do hack again.
+ * Or snapshot will be greatly slowed down by a subtree qgroup rescan
+ */
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ if (ret < 0)
+ goto fail;
+
ret = btrfs_insert_dir_item(trans, parent_root,
dentry->d_name.name, dentry->d_name.len,
parent_inode, &key,
@@ -1523,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1533,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
current_fs_time(parent_inode->i_sb);
ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
@@ -1548,31 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
- /*
- * account qgroup counters before qgroup_inherit()
- */
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret)
- goto fail;
- ret = btrfs_qgroup_account_extents(trans, fs_info);
- if (ret)
- goto fail;
- ret = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1627,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root)
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
- if (btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
super->cache_generation = root_item->generation;
if (root->fs_info->update_uuid_tree_gen)
super->uuid_tree_generation = root_item->generation;
@@ -1768,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
WARN_ON(trans->use_count > 1);
- btrfs_abort_transaction(trans, root, err);
+ btrfs_abort_transaction(trans, err);
spin_lock(&root->fs_info->trans_lock);
@@ -1813,15 +1896,15 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
return btrfs_start_delalloc_roots(fs_info, 1, -1);
return 0;
}
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
- btrfs_wait_ordered_roots(fs_info, -1);
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
}
static inline void
@@ -2145,7 +2228,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_transaction(trans, root);
if (ret) {
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
goto scrub_continue;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 72be51f7ca2fb..efb1226433800 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -110,7 +110,6 @@ struct btrfs_trans_handle {
u64 chunk_bytes_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
- unsigned long blocks_used;
unsigned long delayed_ref_updates;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
@@ -121,6 +120,7 @@ struct btrfs_trans_handle {
bool can_flush_pending_bgs;
bool reloc_reserved;
bool sync;
+ bool dirty;
unsigned int type;
/*
* this root is only needed to validate that the root passed to
@@ -128,6 +128,7 @@ struct btrfs_trans_handle {
* Subvolume quota depends on this
*/
struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
struct seq_list delayed_ref_elem;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
@@ -144,7 +145,7 @@ struct btrfs_pending_snapshot {
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
u64 qgroup_reserved;
- /* extra metadata reseration for relocation */
+ /* extra metadata reservation for relocation */
int error;
bool readonly;
struct list_head list;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 24d03c751149f..d31a0c4f56bed 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2330,7 +2330,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
/* for regular files, make sure corresponding
- * orhpan item exist. extents past the new EOF
+ * orphan item exist. extents past the new EOF
* will be truncated later by orphan cleanup.
*/
if (S_ISREG(mode)) {
@@ -2422,8 +2422,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
root_owner = btrfs_header_owner(parent);
next = btrfs_find_create_tree_block(root, bytenr);
- if (!next)
- return -ENOMEM;
+ if (IS_ERR(next))
+ return PTR_ERR(next);
if (*level == 1) {
ret = wc->process_func(root, next, wc, ptr_gen);
@@ -2757,7 +2757,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
- if (!btrfs_test_opt(root, SSD) &&
+ if (!btrfs_test_opt(root->fs_info, SSD) &&
test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
@@ -2788,7 +2788,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
if (ret) {
blk_finish_plug(&plug);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_free_logged_extents(log, log_transid);
btrfs_set_log_full_commit(root->fs_info, trans);
mutex_unlock(&root->log_mutex);
@@ -2838,7 +2838,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_log_full_commit(root->fs_info, trans);
if (ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
@@ -2898,7 +2898,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
if (ret) {
btrfs_set_log_full_commit(root->fs_info, trans);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
@@ -2934,7 +2934,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
if (ret) {
btrfs_set_log_full_commit(root->fs_info, trans);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
@@ -2991,7 +2991,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
ret = walk_log_tree(trans, log, &wc);
/* I don't think this can happen but just in case */
if (ret)
- btrfs_abort_transaction(trans, log, ret);
+ btrfs_abort_transaction(trans, ret);
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -3001,7 +3001,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
break;
clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_NEW);
}
/*
@@ -3160,7 +3160,7 @@ out_unlock:
btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
@@ -3193,7 +3193,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
return ret;
@@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
+ down_write(&BTRFS_I(inode)->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
@@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
+ btrfs_get_logged_extents(inode, logged_list, start, end);
/*
- * Collect any new ordered extents within the range. This is to
- * prevent logging file extent items without waiting for the disk
- * location they point to being written. We do this only to deal
- * with races against concurrent lockless direct IO writes.
+ * Some ordered extents started by fsync might have completed
+ * before we could collect them into the list logged_list, which
+ * means they're gone, not in our logged_list nor in the inode's
+ * ordered tree. We want the application/user space to know an
+ * error happened while attempting to persist file data so that
+ * it can take proper action. If such error happened, we leave
+ * without writing to the log tree and the fsync must report the
+ * file data write error and not commit the current transaction.
*/
- btrfs_get_logged_extents(inode, logged_list, start, end);
+ ret = btrfs_inode_check_errors(inode);
+ if (ret)
+ ctx->io_err = ret;
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4210,7 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
+ up_write(&BTRFS_I(inode)->dio_sem);
btrfs_release_path(path);
return ret;
@@ -4415,6 +4424,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ const int slot,
+ const struct btrfs_key *key,
+ struct inode *inode)
+{
+ int ret;
+ struct btrfs_path *search_path;
+ char *name = NULL;
+ u32 name_len = 0;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 cur_offset = 0;
+ unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+ search_path = btrfs_alloc_path();
+ if (!search_path)
+ return -ENOMEM;
+ search_path->search_commit_root = 1;
+ search_path->skip_locking = 1;
+
+ while (cur_offset < item_size) {
+ u64 parent;
+ u32 this_name_len;
+ u32 this_len;
+ unsigned long name_ptr;
+ struct btrfs_dir_item *di;
+
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+
+ iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ parent = key->offset;
+ this_name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_ptr = (unsigned long)(iref + 1);
+ this_len = sizeof(*iref) + this_name_len;
+ } else {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ this_name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_ptr = (unsigned long)&extref->name;
+ this_len = sizeof(*extref) + this_name_len;
+ }
+
+ if (this_name_len > name_len) {
+ char *new_name;
+
+ new_name = krealloc(name, this_name_len, GFP_NOFS);
+ if (!new_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ name_len = this_name_len;
+ name = new_name;
+ }
+
+ read_extent_buffer(eb, name, name_ptr, this_name_len);
+ di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
+ search_path, parent,
+ name, this_name_len, 0);
+ if (di && !IS_ERR(di)) {
+ ret = 1;
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ btrfs_release_path(search_path);
+
+ cur_offset += this_len;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(search_path);
+ kfree(name);
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4502,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
/*
- * Collect ordered extents only if we are logging data. This is to
- * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
- * will process the ordered extents if they still exists at the time,
- * because when we collect them we test and set for the flag
- * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
- * same ordered extents. The consequence for the LOG_INODE_ALL log mode
- * not processing the ordered extents is that we end up logging the
- * corresponding file extent items, based on the extent maps in the
- * inode's extent_map_tree's modified_list, without logging the
- * respective checksums (since the may still be only attached to the
- * ordered extents and have not been inserted in the csum tree by
- * btrfs_finish_ordered_io() yet).
- */
- if (inode_only == LOG_INODE_ALL)
- btrfs_get_logged_extents(inode, &logged_list, start, end);
-
- /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -4590,6 +4703,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
ins_nr = 0;
ret = btrfs_search_forward(root, &min_key,
path, trans->transid);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
if (ret != 0)
break;
again:
@@ -4602,6 +4719,22 @@ again:
if (min_key.type == BTRFS_INODE_ITEM_KEY)
need_log_inode_item = false;
+ if ((min_key.type == BTRFS_INODE_REF_KEY ||
+ min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+ BTRFS_I(inode)->generation == trans->transid) {
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0],
+ &min_key, inode);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ } else if (ret > 0) {
+ err = 1;
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ goto out_unlock;
+ }
+ }
+
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
if (ins_nr == 0)
@@ -4709,21 +4842,6 @@ log_extents:
goto out_unlock;
}
if (fast_search) {
- /*
- * Some ordered extents started by fsync might have completed
- * before we collected the ordered extents in logged_list, which
- * means they're gone, not in our logged_list nor in the inode's
- * ordered tree. We want the application/user space to know an
- * error happened while attempting to persist file data so that
- * it can take proper action. If such error happened, we leave
- * without writing to the log tree and the fsync must report the
- * file data write error and not commit the current transaction.
- */
- err = btrfs_inode_check_errors(inode);
- if (err) {
- ctx->io_err = err;
- goto out_unlock;
- }
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
&logged_list, ctx, start, end);
if (ret) {
@@ -4800,7 +4918,7 @@ out_unlock:
* the actual unlink operation, so if we do this check before a concurrent task
* sets last_unlink_trans it means we've logged a consistent version/state of
* all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task migth have only updated last_unlink_trans before
+ * commit (the concurrent task might have only updated last_unlink_trans before
* we logged the inode or it might have also done the unlink).
*/
static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
@@ -4851,7 +4969,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
goto out;
if (!S_ISDIR(inode->i_mode)) {
- if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
goto out;
inode = d_inode(parent);
}
@@ -4859,7 +4977,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
while (1) {
/*
* If we are logging a directory then we start with our inode,
- * not our parents inode, so we need to skipp setting the
+ * not our parent's inode, so we need to skip setting the
* logged_trans so that further down in the log code we don't
* think this inode has already been logged.
*/
@@ -4872,7 +4990,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
break;
}
- if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
if (IS_ROOT(parent))
@@ -5021,7 +5139,7 @@ process_leaf:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR)
+ if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
btrfs_release_path(path);
ret = btrfs_log_inode(trans, root, di_inode,
@@ -5141,11 +5259,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (IS_ERR(dir_inode))
continue;
+ if (ctx)
+ ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, dir_inode,
LOG_INODE_ALL, 0, LLONG_MAX, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, dir_inode))
ret = 1;
+ if (!ret && ctx && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, root,
+ dir_inode, ctx);
iput(dir_inode);
if (ret)
goto out;
@@ -5182,7 +5305,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
sb = inode->i_sb;
- if (btrfs_test_opt(root, NOTREELOG)) {
+ if (btrfs_test_opt(root->fs_info, NOTREELOG)) {
ret = 1;
goto end_no_trans;
}
@@ -5238,7 +5361,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
log_dentries = true;
/*
- * On unlink we must make sure all our current and old parent directores
+ * On unlink we must make sure all our current and old parent directory
* inodes are fully logged. This is to prevent leaving dangling
* directory index entries in directories that were our parents but are
* not anymore. Not doing this results in old parent directory being
@@ -5285,7 +5408,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
}
while (1) {
- if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
inode = d_inode(parent);
@@ -5382,7 +5505,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
@@ -5396,7 +5519,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_std_error(fs_info, ret,
+ btrfs_handle_fs_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
@@ -5414,7 +5537,7 @@ again:
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_std_error(fs_info, ret,
+ btrfs_handle_fs_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
@@ -5429,7 +5552,7 @@ again:
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_std_error(fs_info, ret, "Couldn't read target root "
+ btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
@@ -5515,11 +5638,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
- if (S_ISREG(inode->i_mode)) {
- mutex_lock(&BTRFS_I(inode)->log_mutex);
- BTRFS_I(inode)->last_unlink_trans = trans->transid;
- mutex_unlock(&BTRFS_I(inode)->log_mutex);
- }
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
/*
* if this directory was already logged any new
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 91feb2bdefeeb..b1434bb57e36e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -28,7 +28,7 @@
* }
* ulist_free(ulist);
*
- * This assumes the graph nodes are adressable by u64. This stems from the
+ * This assumes the graph nodes are addressable by u64. This stems from the
* usage for tree enumeration in btrfs, where the logical addresses are
* 64 bit.
*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e2b54d546b7c0..bb0addce75586 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,13 +20,13 @@
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
-#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
+#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
@@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
};
+/*
+ * Table to convert BTRFS_RAID_* to the error code used when the minimum number
+ * of devices condition is not met. Zero means there's no corresponding
+ * BTRFS_ERROR_DEV_*_NOT_MET value.
+ */
+const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+ [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+ [BTRFS_RAID_DUP] = 0,
+ [BTRFS_RAID_RAID0] = 0,
+ [BTRFS_RAID_SINGLE] = 0,
+ [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+ [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+};
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
@@ -125,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static void btrfs_close_one_device(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -447,7 +461,7 @@ loop_lock:
sync_pending = 0;
}
- btrfsic_submit_bio(cur->bi_rw, cur);
+ btrfsic_submit_bio(cur);
num_run++;
batch_run++;
@@ -699,7 +713,8 @@ static noinline int device_list_add(const char *path,
* if there is new btrfs on an already registered device,
* then remove the stale device entry.
*/
- btrfs_free_stale_device(device);
+ if (ret > 0)
+ btrfs_free_stale_device(device);
*fs_devices_ret = fs_devices;
@@ -837,6 +852,46 @@ static void free_device(struct rcu_head *head)
schedule_work(&device->rcu_work);
}
+static void btrfs_close_one_device(struct btrfs_device *device)
+{
+ struct btrfs_fs_devices *fs_devices = device->fs_devices;
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
+ fs_devices->open_devices--;
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ if (device->bdev && device->writeable) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
+}
+
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
@@ -988,6 +1043,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
return ret;
}
+void btrfs_release_disk_super(struct page *page)
+{
+ kunmap(page);
+ put_page(page);
+}
+
+int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
+ struct page **page, struct btrfs_super_block **disk_super)
+{
+ void *p;
+ pgoff_t index;
+
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ return 1;
+
+ /* make sure our super fits in the page */
+ if (sizeof(**disk_super) > PAGE_SIZE)
+ return 1;
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_SHIFT;
+ if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
+ return 1;
+
+ /* pull in the page with our super */
+ *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ index, GFP_KERNEL);
+
+ if (IS_ERR_OR_NULL(*page))
+ return 1;
+
+ p = kmap(*page);
+
+ /* align our pointer to the offset of the super block */
+ *disk_super = p + (bytenr & ~PAGE_MASK);
+
+ if (btrfs_super_bytenr(*disk_super) != bytenr ||
+ btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(*page);
+ return 1;
+ }
+
+ if ((*disk_super)->label[0] &&
+ (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
+ (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+
+ return 0;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -999,13 +1104,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_super_block *disk_super;
struct block_device *bdev;
struct page *page;
- void *p;
int ret = -EINVAL;
u64 devid;
u64 transid;
u64 total_devices;
u64 bytenr;
- pgoff_t index;
/*
* we would like to check all the supers, but that would make
@@ -1018,41 +1121,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
mutex_lock(&uuid_mutex);
bdev = blkdev_get_by_path(path, flags, holder);
-
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto error;
}
- /* make sure our super fits in the device */
- if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
- goto error_bdev_put;
-
- /* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+ if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
goto error_bdev_put;
- /* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_CACHE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
- goto error_bdev_put;
-
- /* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
- index, GFP_NOFS);
-
- if (IS_ERR_OR_NULL(page))
- goto error_bdev_put;
-
- p = kmap(page);
-
- /* align our pointer to the offset of the super block */
- disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
-
- if (btrfs_super_bytenr(disk_super) != bytenr ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC)
- goto error_unmap;
-
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
@@ -1060,8 +1136,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (ret > 0) {
if (disk_super->label[0]) {
- if (disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
} else {
printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
@@ -1073,9 +1147,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
-error_unmap:
- kunmap(page);
- page_cache_release(page);
+ btrfs_release_disk_super(page);
error_bdev_put:
blkdev_put(bdev, flags);
@@ -1454,7 +1526,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_std_error(root->fs_info, ret, "Slot search failed");
+ btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed");
goto out;
}
@@ -1462,7 +1534,7 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to remove dev extent item");
} else {
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
@@ -1688,32 +1760,92 @@ out:
return ret;
}
-int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+/*
+ * Verify that @num_devices satisfies the RAID profile constraints in the whole
+ * filesystem. It's up to the caller to adjust that number to account for e.g.
+ * device replace.
+ */
+static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
+ u64 num_devices)
+{
+ u64 all_avail;
+ unsigned seq;
+ int i;
+
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ all_avail = fs_info->avail_data_alloc_bits |
+ fs_info->avail_system_alloc_bits |
+ fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!(all_avail & btrfs_raid_group[i]))
+ continue;
+
+ if (num_devices < btrfs_raid_array[i].devs_min) {
+ int ret = btrfs_raid_mindev_error[i];
+
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
+ struct btrfs_device *device)
{
- struct btrfs_device *device;
struct btrfs_device *next_device;
- struct block_device *bdev;
- struct buffer_head *bh = NULL;
- struct btrfs_super_block *disk_super;
+
+ list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
+ if (next_device != device &&
+ !next_device->missing && next_device->bdev)
+ return next_device;
+ }
+
+ return NULL;
+}
+
+/*
+ * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * and replace it with the provided or the next active device. In the context
+ * where this function is called, there should always be another device (or
+ * this_dev) which is active.
+ */
+void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *device, struct btrfs_device *this_dev)
+{
+ struct btrfs_device *next_device;
+
+ if (this_dev)
+ next_device = this_dev;
+ else
+ next_device = btrfs_find_next_active_device(fs_info->fs_devices,
+ device);
+ ASSERT(next_device);
+
+ if (fs_info->sb->s_bdev &&
+ (fs_info->sb->s_bdev == device->bdev))
+ fs_info->sb->s_bdev = next_device->bdev;
+
+ if (fs_info->fs_devices->latest_bdev == device->bdev)
+ fs_info->fs_devices->latest_bdev = next_device->bdev;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
+{
+ struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
- u64 all_avail;
- u64 devid;
u64 num_devices;
- u8 *dev_uuid;
- unsigned seq;
int ret = 0;
bool clear_super = false;
+ char *dev_name = NULL;
mutex_lock(&uuid_mutex);
- do {
- seq = read_seqbegin(&root->fs_info->profiles_lock);
-
- all_avail = root->fs_info->avail_data_alloc_bits |
- root->fs_info->avail_system_alloc_bits |
- root->fs_info->avail_metadata_alloc_bits;
- } while (read_seqretry(&root->fs_info->profiles_lock, seq));
-
num_devices = root->fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
@@ -1722,78 +1854,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
- ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
- goto out;
- }
-
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
- ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+ ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1);
+ if (ret)
goto out;
- }
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
- root->fs_info->fs_devices->rw_devices <= 2) {
- ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
- goto out;
- }
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
- root->fs_info->fs_devices->rw_devices <= 3) {
- ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
+ ret = btrfs_find_device_by_devspec(root, devid, device_path,
+ &device);
+ if (ret)
goto out;
- }
-
- if (strcmp(device_path, "missing") == 0) {
- struct list_head *devices;
- struct btrfs_device *tmp;
-
- device = NULL;
- devices = &root->fs_info->fs_devices->devices;
- /*
- * It is safe to read the devices since the volume_mutex
- * is held.
- */
- list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata &&
- !tmp->is_tgtdev_for_dev_replace &&
- !tmp->bdev) {
- device = tmp;
- break;
- }
- }
- bdev = NULL;
- bh = NULL;
- disk_super = NULL;
- if (!device) {
- ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
- goto out;
- }
- } else {
- ret = btrfs_get_bdev_and_sb(device_path,
- FMODE_WRITE | FMODE_EXCL,
- root->fs_info->bdev_holder, 0,
- &bdev, &bh);
- if (ret)
- goto out;
- disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
- device = btrfs_find_device(root->fs_info, devid, dev_uuid,
- disk_super->fsid);
- if (!device) {
- ret = -ENOENT;
- goto error_brelse;
- }
- }
if (device->is_tgtdev_for_dev_replace) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto error_brelse;
+ goto out;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto error_brelse;
+ goto out;
}
if (device->writeable) {
@@ -1801,6 +1878,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
unlock_chunks(root);
+ dev_name = kstrdup(device->name->str, GFP_KERNEL);
+ if (!dev_name) {
+ ret = -ENOMEM;
+ goto error_undo;
+ }
clear_super = true;
}
@@ -1842,12 +1924,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->missing)
device->fs_devices->missing_devices--;
- next_device = list_entry(root->fs_info->fs_devices->devices.next,
- struct btrfs_device, dev_list);
- if (device->bdev == root->fs_info->sb->s_bdev)
- root->fs_info->sb->s_bdev = next_device->bdev;
- if (device->bdev == root->fs_info->fs_devices->latest_bdev)
- root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+ btrfs_assign_next_active_device(root->fs_info, device, NULL);
if (device->bdev) {
device->fs_devices->open_devices--;
@@ -1883,63 +1960,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
- if (clear_super && disk_super) {
- u64 bytenr;
- int i;
-
- /* make sure this device isn't detected as part of
- * the FS anymore
- */
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
-
- /* clear the mirror copies of super block on the disk
- * being removed, 0th copy is been taken care above and
- * the below would take of the rest
- */
- for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- i_size_read(bdev->bd_inode))
- break;
-
- brelse(bh);
- bh = __bread(bdev, bytenr / 4096,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh)
- continue;
-
- disk_super = (struct btrfs_super_block *)bh->b_data;
-
- if (btrfs_super_bytenr(disk_super) != bytenr ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- continue;
- }
- memset(&disk_super->magic, 0,
- sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
+ if (clear_super) {
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL,
+ root->fs_info->bdev_holder);
+ if (!IS_ERR(bdev)) {
+ btrfs_scratch_superblocks(bdev, dev_name);
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
}
}
- ret = 0;
-
- if (bdev) {
- /* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
-
- /* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
- }
-
-error_brelse:
- brelse(bh);
- if (bdev)
- blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
+ kfree(dev_name);
+
mutex_unlock(&uuid_mutex);
return ret;
+
error_undo:
if (device->writeable) {
lock_chunks(root);
@@ -1948,7 +1985,7 @@ error_undo:
device->fs_devices->rw_devices++;
unlock_chunks(root);
}
- goto error_brelse;
+ goto out;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
@@ -1972,11 +2009,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->missing)
fs_devices->missing_devices--;
- if (srcdev->writeable) {
+ if (srcdev->writeable)
fs_devices->rw_devices--;
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
- }
if (srcdev->bdev)
fs_devices->open_devices--;
@@ -1987,6 +2021,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
+ if (srcdev->writeable) {
+ /* zero out the old super if it is writable */
+ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
+ }
call_rcu(&srcdev->rcu, free_device);
/*
@@ -2016,32 +2054,33 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
- struct btrfs_device *next_device;
-
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
- if (tgtdev->bdev) {
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ if (tgtdev->bdev)
fs_info->fs_devices->open_devices--;
- }
+
fs_info->fs_devices->num_devices--;
- next_device = list_entry(fs_info->fs_devices->devices.next,
- struct btrfs_device, dev_list);
- if (tgtdev->bdev == fs_info->sb->s_bdev)
- fs_info->sb->s_bdev = next_device->bdev;
- if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
- list_del_rcu(&tgtdev->dev_list);
+ btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
- call_rcu(&tgtdev->rcu, free_device);
+ list_del_rcu(&tgtdev->dev_list);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+
+ /*
+ * The update_dev_time() with in btrfs_scratch_superblocks()
+ * may lead to a call to btrfs_show_devname() which will try
+ * to hold device_list_mutex. And here this device
+ * is already out of device list, so we don't have to hold
+ * the device_list_mutex lock.
+ */
+ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ call_rcu(&tgtdev->rcu, free_device);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -2102,6 +2141,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
}
/*
+ * Lookup a device given by device id, or the path if the id is 0.
+ */
+int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+ char *devpath,
+ struct btrfs_device **device)
+{
+ int ret;
+
+ if (devid) {
+ ret = 0;
+ *device = btrfs_find_device(root->fs_info, devid, NULL,
+ NULL);
+ if (!*device)
+ ret = -ENOENT;
+ } else {
+ if (!devpath || !devpath[0])
+ return -EINVAL;
+
+ ret = btrfs_find_device_missing_or_by_path(root, devpath,
+ device);
+ }
+ return ret;
+}
+
+/*
* does all the dirty work required for changing file system's UUID.
*/
static int btrfs_prepare_sprout(struct btrfs_root *root)
@@ -2165,7 +2229,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
}
/*
- * strore the expected generation for seed devices in device items.
+ * Store the expected generation for seed devices in device items.
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -2374,14 +2438,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = init_first_rw_device(trans, root, device);
unlock_chunks(root);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error_trans;
}
}
ret = btrfs_add_device(trans, root, device);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error_trans;
}
@@ -2390,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_finish_sprout(trans, root);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error_trans;
}
@@ -2418,7 +2482,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
@@ -2663,7 +2727,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_std_error(root->fs_info, -ENOENT,
+ btrfs_handle_fs_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
@@ -2671,7 +2735,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_std_error(root->fs_info, ret,
+ btrfs_handle_fs_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
@@ -2736,6 +2800,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
u64 dev_extent_len = 0;
u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
int i, ret = 0;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
/* Just in case */
root = root->fs_info->chunk_root;
@@ -2762,13 +2827,20 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
check_system_chunk(trans, extent_root, map->type);
unlock_chunks(root->fs_info->chunk_root);
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+ */
+ mutex_lock(&fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2786,14 +2858,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
}
+ mutex_unlock(&fs_devices->device_list_mutex);
+
ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2802,14 +2877,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2857,7 +2932,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
return ret;
}
@@ -2866,7 +2941,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
* chunk tree entries
*/
ret = btrfs_remove_chunk(trans, root, chunk_offset);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans, extent_root);
return ret;
}
@@ -3362,7 +3437,7 @@ static int should_balance_chunk(struct btrfs_root *root,
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
/*
* Same logic as the 'limit' filter; the minimum cannot be
- * determined here because we do not have the global informatoin
+ * determined here because we do not have the global information
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
@@ -3385,7 +3460,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u64 size_to_free;
u64 chunk_type;
struct btrfs_chunk *chunk;
- struct btrfs_path *path;
+ struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_trans_handle *trans;
@@ -3402,6 +3477,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
+ u64 bytes_used = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -3418,13 +3494,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
ret = btrfs_shrink_device(device, old_size - size_to_free);
if (ret == -ENOSPC)
break;
- BUG_ON(ret);
+ if (ret) {
+ /* btrfs_shrink_device never returns ret > 0 */
+ WARN_ON(ret > 0);
+ goto error;
+ }
trans = btrfs_start_transaction(dev_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_info_in_rcu(fs_info,
+ "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
+ rcu_str_deref(device->name), ret,
+ old_size, old_size - size_to_free);
+ goto error;
+ }
ret = btrfs_grow_device(trans, device, old_size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_end_transaction(trans, dev_root);
+ /* btrfs_grow_device never returns ret > 0 */
+ WARN_ON(ret > 0);
+ btrfs_info_in_rcu(fs_info,
+ "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
+ rcu_str_deref(device->name), ret,
+ old_size, old_size - size_to_free);
+ goto error;
+ }
btrfs_end_transaction(trans, dev_root);
}
@@ -3540,7 +3636,13 @@ again:
goto loop;
}
- if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ ASSERT(fs_info->data_sinfo);
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
+
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ !chunk_reserved && !bytes_used) {
trans = btrfs_start_transaction(chunk_root, 0);
if (IS_ERR(trans)) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3632,7 +3734,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
- btrfs_std_error(fs_info, ret, NULL);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
@@ -3693,10 +3795,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
num_devices--;
}
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
- allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- if (num_devices == 1)
- allowed |= BTRFS_BLOCK_GROUP_DUP;
- else if (num_devices > 1)
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
+ if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
if (num_devices > 2)
allowed |= BTRFS_BLOCK_GROUP_RAID5;
@@ -3844,7 +3944,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->balance_lock);
- if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
btrfs_info(fs_info, "force skipping balance");
return 0;
}
@@ -4199,7 +4299,8 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
ret = PTR_ERR(uuid_root);
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans, tree_root);
return ret;
}
@@ -4472,8 +4573,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
-#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
- - sizeof(struct btrfs_item) \
+#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
@@ -4652,12 +4752,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_RAID5) {
raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
- btrfs_super_stripesize(info->super_copy));
+ extent_root->stripesize);
data_stripes = num_stripes - 1;
}
if (type & BTRFS_BLOCK_GROUP_RAID6) {
raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
- btrfs_super_stripesize(info->super_copy));
+ extent_root->stripesize);
data_stripes = num_stripes - 2;
}
@@ -5218,7 +5318,7 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
kfree(bbio);
}
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map)
@@ -5278,7 +5378,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_nr = div64_u64(stripe_nr, stripe_len);
stripe_offset = stripe_nr * stripe_len;
- BUG_ON(offset < stripe_offset);
+ if (offset < stripe_offset) {
+ btrfs_crit(fs_info, "stripe math has gone wrong, "
+ "stripe_offset=%llu, offset=%llu, start=%llu, "
+ "logical=%llu, stripe_len=%llu",
+ stripe_offset, offset, em->start, logical,
+ stripe_len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
@@ -5296,7 +5404,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
raid56_full_stripe_start *= full_stripe_len;
}
- if (rw & REQ_DISCARD) {
+ if (op == REQ_OP_DISCARD) {
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
@@ -5309,7 +5417,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
For other RAID types and for RAID[56] reads, just allow a single
stripe (on a single disk). */
if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- (rw & REQ_WRITE)) {
+ (op == REQ_OP_WRITE)) {
max_len = stripe_len * nr_data_stripes(map) -
(offset - raid56_full_stripe_start);
} else {
@@ -5334,8 +5442,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
btrfs_dev_replace_set_lock_blocking(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
- dev_replace->tgtdev != NULL) {
+ op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
+ op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
/*
* in dev-replace case, for repair case (that's the only
* case where the mirror is selected explicitly when
@@ -5422,15 +5530,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
(offset + *length);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- if (rw & REQ_DISCARD)
+ if (op == REQ_OP_DISCARD)
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
+ if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
+ op != REQ_GET_READ_MIRRORS)
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
+ if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
+ op == REQ_GET_READ_MIRRORS)
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -5443,7 +5553,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
+ if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
+ op == REQ_GET_READ_MIRRORS) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5457,9 +5568,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
- if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+ if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS)
num_stripes = map->sub_stripes;
- else if (rw & REQ_DISCARD)
+ else if (op == REQ_OP_DISCARD)
num_stripes = min_t(u64, map->sub_stripes *
(stripe_nr_end - stripe_nr_orig),
map->num_stripes);
@@ -5477,7 +5588,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
if (need_raid_map &&
- ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div_u64(raid56_full_stripe_start,
@@ -5505,8 +5616,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if (!(rw & (REQ_WRITE | REQ_DISCARD |
- REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
+ if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
+ op != REQ_GET_READ_MIRRORS) && mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -5519,13 +5630,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
&stripe_index);
mirror_num = stripe_index + 1;
}
- BUG_ON(stripe_index >= map->num_stripes);
+ if (stripe_index >= map->num_stripes) {
+ btrfs_crit(fs_info, "stripe index math went horribly wrong, "
+ "got stripe_index=%u, num_stripes=%u",
+ stripe_index, map->num_stripes);
+ ret = -EINVAL;
+ goto out;
+ }
num_alloc_stripes = num_stripes;
if (dev_replace_is_ongoing) {
- if (rw & (REQ_WRITE | REQ_DISCARD))
+ if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD)
num_alloc_stripes <<= 1;
- if (rw & REQ_GET_READ_MIRRORS)
+ if (op == REQ_GET_READ_MIRRORS)
num_alloc_stripes++;
tgtdev_indexes = num_stripes;
}
@@ -5540,7 +5657,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ need_raid_map &&
+ ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) ||
mirror_num > 1)) {
u64 tmp;
unsigned rot;
@@ -5565,7 +5683,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
RAID6_Q_STRIPE;
}
- if (rw & REQ_DISCARD) {
+ if (op == REQ_OP_DISCARD) {
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
@@ -5645,14 +5763,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
- if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+ if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS)
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
sort_parity_stripes(bbio, num_stripes);
tgtdev_indexes = 0;
- if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+ if (dev_replace_is_ongoing &&
+ (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) &&
dev_replace->tgtdev != NULL) {
int index_where_to_add;
u64 srcdev_devid = dev_replace->srcdev->devid;
@@ -5687,7 +5806,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
num_stripes = index_where_to_add;
- } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+ } else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) &&
dev_replace->tgtdev != NULL) {
u64 srcdev_devid = dev_replace->srcdev->devid;
int index_srcdev = 0;
@@ -5718,20 +5837,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
if (found) {
- if (physical_of_found + map->stripe_len <=
- dev_replace->cursor_left) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
- tgtdev_indexes++;
- num_stripes++;
- }
+ tgtdev_indexes++;
+ num_stripes++;
}
}
@@ -5762,21 +5878,21 @@ out:
return ret;
}
-int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
- return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
mirror_num, 0);
}
/* For Scrub/replace */
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
int need_raid_map)
{
- return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
mirror_num, need_raid_map);
}
@@ -5890,7 +6006,7 @@ static void btrfs_end_bio(struct bio *bio)
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
if (dev->bdev) {
- if (bio->bi_rw & WRITE)
+ if (bio_op(bio) == REQ_OP_WRITE)
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else
@@ -5944,7 +6060,7 @@ static void btrfs_end_bio(struct bio *bio)
*/
static noinline void btrfs_schedule_bio(struct btrfs_root *root,
struct btrfs_device *device,
- int rw, struct bio *bio)
+ struct bio *bio)
{
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
@@ -5955,9 +6071,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
}
/* don't bother with additional async steps for reads, right now */
- if (!(rw & REQ_WRITE)) {
+ if (bio_op(bio) == REQ_OP_READ) {
bio_get(bio);
- btrfsic_submit_bio(rw, bio);
+ btrfsic_submit_bio(bio);
bio_put(bio);
return;
}
@@ -5971,7 +6087,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
atomic_inc(&root->fs_info->nr_async_bios);
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
- bio->bi_rw |= rw;
spin_lock(&device->io_lock);
if (bio->bi_rw & REQ_SYNC)
@@ -5997,7 +6112,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
struct bio *bio, u64 physical, int dev_nr,
- int rw, int async)
+ int async)
{
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
@@ -6011,8 +6126,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
rcu_read_lock();
name = rcu_dereference(dev->name);
- pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
- "(%s id %llu), size=%u\n", rw,
+ pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu "
+ "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_rw,
(u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
name->str, dev->devid, bio->bi_iter.bi_size);
rcu_read_unlock();
@@ -6023,16 +6138,16 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
btrfs_bio_counter_inc_noblocked(root->fs_info);
if (async)
- btrfs_schedule_bio(root, dev, rw, bio);
+ btrfs_schedule_bio(root, dev, bio);
else
- btrfsic_submit_bio(rw, bio);
+ btrfsic_submit_bio(bio);
}
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
- /* Shoud be the original bio. */
+ /* Should be the original bio. */
WARN_ON(bio != bbio->orig_bio);
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -6042,7 +6157,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
}
}
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
int mirror_num, int async_submit)
{
struct btrfs_device *dev;
@@ -6059,8 +6174,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
map_length = length;
btrfs_bio_counter_inc_blocked(root->fs_info);
- ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
- mirror_num, 1);
+ ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical,
+ &map_length, &bbio, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(root->fs_info);
return ret;
@@ -6074,10 +6189,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((rw & WRITE) || (mirror_num > 1))) {
+ ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
- if (rw & WRITE) {
+ if (bio_op(bio) == REQ_OP_WRITE) {
ret = raid56_parity_write(root, bio, bbio, map_length);
} else {
ret = raid56_parity_recover(root, bio, bbio, map_length,
@@ -6096,7 +6211,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
- if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+ if (!dev || !dev->bdev ||
+ (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) {
bbio_error(bbio, first_bio, logical);
continue;
}
@@ -6108,7 +6224,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bio = first_bio;
submit_stripe_bio(root, bbio, bio,
- bbio->stripes[dev_nr].physical, dev_nr, rw,
+ bbio->stripes[dev_nr].physical, dev_nr,
async_submit);
}
btrfs_bio_counter_dec(root->fs_info);
@@ -6206,27 +6322,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
return dev;
}
-static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
- struct extent_buffer *leaf,
- struct btrfs_chunk *chunk)
+/* Return -EIO if any error, otherwise return 0. */
+static int btrfs_check_chunk_valid(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical)
{
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
- u64 logical;
u64 length;
u64 stripe_len;
- u64 devid;
- u8 uuid[BTRFS_UUID_SIZE];
- int num_stripes;
- int ret;
- int i;
+ u16 num_stripes;
+ u16 sub_stripes;
+ u64 type;
- logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
- /* Validation check */
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+
if (!num_stripes) {
btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
num_stripes);
@@ -6237,24 +6349,70 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
"invalid chunk logical %llu", logical);
return -EIO;
}
+ if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) {
+ btrfs_err(root->fs_info, "invalid chunk sectorsize %u",
+ btrfs_chunk_sector_size(leaf, chunk));
+ return -EIO;
+ }
if (!length || !IS_ALIGNED(length, root->sectorsize)) {
btrfs_err(root->fs_info,
"invalid chunk length %llu", length);
return -EIO;
}
- if (!is_power_of_2(stripe_len)) {
+ if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
stripe_len);
return -EIO;
}
if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- btrfs_chunk_type(leaf, chunk)) {
+ type) {
btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
BTRFS_BLOCK_GROUP_PROFILE_MASK) &
btrfs_chunk_type(leaf, chunk));
return -EIO;
}
+ if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != 1)) {
+ btrfs_err(root->fs_info,
+ "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+ num_stripes, sub_stripes,
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ u64 logical;
+ u64 length;
+ u64 stripe_len;
+ u64 devid;
+ u8 uuid[BTRFS_UUID_SIZE];
+ int num_stripes;
+ int ret;
+ int i;
+
+ logical = key->offset;
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ ret = btrfs_check_chunk_valid(root, leaf, chunk, logical);
+ if (ret)
+ return ret;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6301,7 +6459,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
- if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+ if (!map->stripes[i].dev &&
+ !btrfs_test_opt(root->fs_info, DEGRADED)) {
free_extent_map(em);
return -EIO;
}
@@ -6369,7 +6528,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
fs_devices = find_fsid(fsid);
if (!fs_devices) {
- if (!btrfs_test_opt(root, DEGRADED))
+ if (!btrfs_test_opt(root->fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
fs_devices = alloc_fs_devices(fsid);
@@ -6431,7 +6590,7 @@ static int read_one_dev(struct btrfs_root *root,
device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
if (!device) {
- if (!btrfs_test_opt(root, DEGRADED))
+ if (!btrfs_test_opt(root->fs_info, DEGRADED))
return -EIO;
device = add_missing_dev(root, fs_devices, devid, dev_uuid);
@@ -6440,7 +6599,7 @@ static int read_one_dev(struct btrfs_root *root,
btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
devid, dev_uuid);
} else {
- if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+ if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED))
return -EIO;
if(!device->bdev && !device->missing) {
@@ -6502,6 +6661,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
u32 array_size;
u32 len = 0;
u32 cur_offset;
+ u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
@@ -6511,12 +6671,12 @@ int btrfs_read_sys_array(struct btrfs_root *root)
* overallocate but we can keep it as-is, only the first page is used.
*/
sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
- if (!sb)
- return -ENOMEM;
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
- * The sb extent buffer is artifical and just used to read the system array.
+ * The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all it's
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
@@ -6527,7 +6687,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
* but sb spans only this function. Add an explicit SetPageUptodate call
* to silence the warning eg. on PowerPC 64.
*/
- if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+ if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
@@ -6568,6 +6728,15 @@ int btrfs_read_sys_array(struct btrfs_root *root)
break;
}
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
+ btrfs_err(root->fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6586,13 +6755,15 @@ int btrfs_read_sys_array(struct btrfs_root *root)
sb_array_offset += len;
cur_offset += len;
}
- free_extent_buffer(sb);
+ clear_extent_buffer_uptodate(sb);
+ free_extent_buffer_stale(sb);
return ret;
out_short_read:
printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
len, cur_offset);
- free_extent_buffer(sb);
+ clear_extent_buffer_uptodate(sb);
+ free_extent_buffer_stale(sb);
return -EIO;
}
@@ -6604,6 +6775,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
struct btrfs_key found_key;
int ret;
int slot;
+ u64 total_dev = 0;
root = root->fs_info->chunk_root;
@@ -6645,6 +6817,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
ret = read_one_dev(root, leaf, dev_item);
if (ret)
goto error;
+ total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -6654,6 +6827,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
}
path->slots[0]++;
}
+
+ /*
+ * After loading chunk tree, we've got all device information,
+ * do another round of validation checks.
+ */
+ if (total_dev != root->fs_info->fs_devices->total_devices) {
+ btrfs_err(root->fs_info,
+ "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_super_num_devices(root->fs_info->super_copy),
+ total_dev);
+ ret = -EINVAL;
+ goto error;
+ }
+ if (btrfs_super_total_bytes(root->fs_info->super_copy) <
+ root->fs_info->fs_devices->total_rw_bytes) {
+ btrfs_err(root->fs_info,
+ "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
+ btrfs_super_total_bytes(root->fs_info->super_copy),
+ root->fs_info->fs_devices->total_rw_bytes);
+ ret = -EINVAL;
+ goto error;
+ }
ret = 0;
error:
unlock_chunks(root);
@@ -7007,38 +7202,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
fs_devices = fs_devices->seed;
}
}
-
-static void btrfs_close_one_device(struct btrfs_device *device)
-{
- struct btrfs_fs_devices *fs_devices = device->fs_devices;
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
-
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- list_del_init(&device->dev_alloc_list);
- fs_devices->rw_devices--;
- }
-
- if (device->missing)
- fs_devices->missing_devices--;
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
- }
-
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
-
- call_rcu(&device->rcu, free_device);
-}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1939ebde63dff..6613e6335ca29 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -340,14 +340,14 @@ struct btrfs_raid_attr {
};
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-
+extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES];
extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
struct map_lookup {
u64 type;
int io_align;
int io_width;
- int stripe_len;
+ u64 stripe_len;
int sector_size;
int num_stripes;
int sub_stripes;
@@ -357,52 +357,6 @@ struct map_lookup {
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
-/*
- * Restriper's general type filter
- */
-#define BTRFS_BALANCE_DATA (1ULL << 0)
-#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
-#define BTRFS_BALANCE_METADATA (1ULL << 2)
-
-#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
- BTRFS_BALANCE_SYSTEM | \
- BTRFS_BALANCE_METADATA)
-
-#define BTRFS_BALANCE_FORCE (1ULL << 3)
-#define BTRFS_BALANCE_RESUME (1ULL << 4)
-
-/*
- * Balance filters
- */
-#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
-#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
-#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
-#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
-#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
-#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
-#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
-#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
-#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
-
-#define BTRFS_BALANCE_ARGS_MASK \
- (BTRFS_BALANCE_ARGS_PROFILES | \
- BTRFS_BALANCE_ARGS_USAGE | \
- BTRFS_BALANCE_ARGS_DEVID | \
- BTRFS_BALANCE_ARGS_DRANGE | \
- BTRFS_BALANCE_ARGS_VRANGE | \
- BTRFS_BALANCE_ARGS_LIMIT | \
- BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
- BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
- BTRFS_BALANCE_ARGS_USAGE_RANGE)
-
-/*
- * Profile changing flags. When SOFT is set we won't relocate chunk if
- * it already has the target profile (even though it may be
- * half-filled).
- */
-#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
-#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
-
struct btrfs_balance_args;
struct btrfs_balance_progress;
struct btrfs_balance_control {
@@ -421,10 +375,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
void btrfs_get_bbio(struct btrfs_bio *bbio);
void btrfs_put_bbio(struct btrfs_bio *bbio);
-int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
int need_raid_map);
@@ -437,7 +391,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 type);
void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
int mirror_num, int async_submit);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
@@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *device, struct btrfs_device *this_dev);
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device);
+int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+ char *devpath,
+ struct btrfs_device **device);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
-int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid);
void btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 145d2b89e62dc..d1a177a3dbe89 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,6 +237,9 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
if (trans)
return do_setxattr(trans, inode, name, value, size, flags);
@@ -369,33 +372,29 @@ err:
}
static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- struct inode *inode = d_inode(dentry);
-
name = xattr_full_name(handler, name);
return __btrfs_getxattr(inode, name, buffer, size);
}
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size,
- int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
-
name = xattr_full_name(handler, name);
return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
}
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
- struct dentry *dentry,
+ struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
{
name = xattr_full_name(handler, name);
- return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
+ return btrfs_set_prop(inode, name, value, size, flags);
}
static const struct xattr_handler btrfs_security_xattr_handler = {
@@ -434,25 +433,6 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
NULL,
};
-int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags)
-{
- struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
-
- if (btrfs_root_readonly(root))
- return -EROFS;
- return generic_setxattr(dentry, name, value, size, flags);
-}
-
-int btrfs_removexattr(struct dentry *dentry, const char *name)
-{
- struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
-
- if (btrfs_root_readonly(root))
- return -EROFS;
- return generic_removexattr(dentry, name);
-}
-
static int btrfs_initxattrs(struct inode *inode,
const struct xattr *xattr_array, void *fs_info)
{
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 96807b3d22f50..15fc4743dc700 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -28,9 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-extern int btrfs_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags);
-extern int btrfs_removexattr(struct dentry *dentry, const char *name);
extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 82990b8f872b6..88d274e8ecf22 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -59,7 +59,7 @@ static struct list_head *zlib_alloc_workspace(void)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
workspace->strm.workspace = vmalloc(workspacesize);
- workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -103,7 +103,7 @@ static int zlib_compress_pages(struct list_head *ws,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
data_in = kmap(in_page);
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
@@ -117,8 +117,8 @@ static int zlib_compress_pages(struct list_head *ws,
workspace->strm.next_in = data_in;
workspace->strm.next_out = cpage_out;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
- workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
+ workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_in = min(len, PAGE_SIZE);
while (workspace->strm.total_in < len) {
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
@@ -156,7 +156,7 @@ static int zlib_compress_pages(struct list_head *ws,
cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
workspace->strm.next_out = cpage_out;
}
/* we're all done */
@@ -170,14 +170,14 @@ static int zlib_compress_pages(struct list_head *ws,
bytes_left = len - workspace->strm.total_in;
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
- start += PAGE_CACHE_SIZE;
+ start += PAGE_SIZE;
in_page = find_get_page(mapping,
- start >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT);
data_in = kmap(in_page);
workspace->strm.avail_in = min(bytes_left,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
workspace->strm.next_in = data_in;
}
}
@@ -205,7 +205,7 @@ out:
if (in_page) {
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
}
return ret;
}
@@ -223,18 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
size_t total_out = 0;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long pg_offset;
data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
- workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
@@ -274,7 +274,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
@@ -288,7 +288,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
}
}
if (ret != Z_STREAM_END)
@@ -325,7 +325,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
workspace->strm.total_in = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
workspace->strm.total_out = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
@@ -368,8 +368,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
else
buf_offset = 0;
- bytes = min(PAGE_CACHE_SIZE - pg_offset,
- PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(PAGE_SIZE - pg_offset,
+ PAGE_SIZE - buf_offset);
bytes = min(bytes, bytes_left);
kaddr = kmap_atomic(dest_page);
@@ -380,7 +380,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
bytes_left -= bytes;
next:
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
}
if (ret != Z_STREAM_END && bytes_left != 0)
diff --git a/fs/buffer.c b/fs/buffer.c
index 33be296753583..9c8eb9b6db6aa 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
@@ -45,7 +46,7 @@
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_wbc(int rw, struct buffer_head *bh,
+static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
unsigned long bio_flags,
struct writeback_control *wbc);
@@ -129,7 +130,7 @@ __clear_page_buffers(struct page *page)
{
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
static void buffer_io_error(struct buffer_head *bh, char *msg)
@@ -153,7 +154,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- /* This happens, due to failed READA attempts. */
+ /* This happens, due to failed read-ahead attempts. */
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -207,7 +208,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
struct page *page;
int all_mapped = 1;
- index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+ index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
if (!page)
goto out;
@@ -245,7 +246,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
- page_cache_release(page);
+ put_page(page);
out:
return ret;
}
@@ -255,17 +256,17 @@ out:
*/
static void free_more_memory(void)
{
- struct zone *zone;
+ struct zoneref *z;
int nid;
wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
yield();
for_each_online_node(nid) {
- (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
- gfp_zone(GFP_NOFS), NULL,
- &zone);
- if (zone)
+
+ z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+ gfp_zone(GFP_NOFS), NULL);
+ if (z->zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
GFP_NOFS, NULL);
}
@@ -588,7 +589,7 @@ void write_boundary_block(struct block_device *bdev,
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
- ll_rw_block(WRITE, 1, &bh);
+ ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
put_bh(bh);
}
}
@@ -1040,7 +1041,7 @@ done:
ret = (block < end_block) ? 1 : -ENXIO;
failed:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -1225,7 +1226,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
} else {
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, 0, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1395,7 +1396,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
- ll_rw_block(READA, 1, &bh);
+ ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
brelse(bh);
}
}
@@ -1533,7 +1534,7 @@ void block_invalidatepage(struct page *page, unsigned int offset,
/*
* Check for overflow
*/
- BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+ BUG_ON(stop > PAGE_SIZE || stop < length);
head = page_buffers(page);
bh = head;
@@ -1687,7 +1688,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode *
* WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
* causes the writes to be flagged as synchronous writes.
*/
-static int __block_write_full_page(struct inode *inode, struct page *page,
+int __block_write_full_page(struct inode *inode, struct page *page,
get_block_t *get_block, struct writeback_control *wbc,
bh_end_io_t *handler)
{
@@ -1697,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1716,7 +1717,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
blocksize = bh->b_size;
bbits = block_size_bits(blocksize);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - bbits);
last_block = (i_size_read(inode) - 1) >> bbits;
/*
@@ -1786,7 +1787,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(write_op, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -1840,7 +1841,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(write_op, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -1848,6 +1849,7 @@ recover:
unlock_page(page);
goto done;
}
+EXPORT_SYMBOL(__block_write_full_page);
/*
* If a page has any new buffers, zero them out here, and mark them uptodate
@@ -1891,10 +1893,64 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
}
EXPORT_SYMBOL(page_zero_new_buffers);
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
+/*
+ * iomap_to_bh - set up a buffer head from an iomap extent
+ * @inode: inode the buffer belongs to; supplies i_blkbits and i_size
+ * @block: file-relative block number covered by @bh
+ * @bh:    buffer head to initialise
+ * @iomap: extent description; it must not end at or before the byte
+ *         offset of @block (enforced with BUG_ON below)
+ *
+ * Copies the block device from @iomap into @bh and then, depending on
+ * iomap->type, sets the new/uptodate/mapped/delay/unwritten buffer flags
+ * and, for unwritten and mapped extents, bh->b_blocknr.  Types not listed
+ * in the switch leave the buffer with only b_bdev assigned.
+ */
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+		struct iomap *iomap)
+{
+	/*
+	 * Widen before shifting: sector_t is not guaranteed to be as wide
+	 * as loff_t, and a shift done in a narrower type would silently
+	 * truncate large byte offsets.
+	 */
+	loff_t offset = (loff_t)block << inode->i_blkbits;
+
+	bh->b_bdev = iomap->bdev;
+
+	/*
+	 * Block points to offset in file we need to map, iomap contains
+	 * the offset at which the map starts. The map has to extend past
+	 * the start of the current block; being handed a map that ends
+	 * before it is treated as a caller bug rather than handled here.
+	 */
+	BUG_ON(offset >= iomap->offset + iomap->length);
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		/*
+		 * If the buffer is not up to date or beyond the current EOF,
+		 * we need to mark it as new to ensure sub-block zeroing is
+		 * executed if necessary.
+		 */
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		break;
+	case IOMAP_DELALLOC:
+		/* Same "new" rule as for holes, evaluated before uptodate is set. */
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		set_buffer_uptodate(bh);
+		set_buffer_mapped(bh);
+		set_buffer_delay(bh);
+		break;
+	case IOMAP_UNWRITTEN:
+		/*
+		 * For unwritten regions, a sub-block write must leave the
+		 * parts of the block that are not being written zeroed.
+		 * Set the buffer as new to ensure this.
+		 */
+		set_buffer_new(bh);
+		set_buffer_unwritten(bh);
+		/* FALLTHRU */
+	case IOMAP_MAPPED:
+		if (offset >= i_size_read(inode))
+			set_buffer_new(bh);
+		/*
+		 * NOTE(review): the (i_blkbits - 9) shift suggests iomap->blkno
+		 * is in 512-byte sectors - confirm against struct iomap.
+		 */
+		bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+				((offset - iomap->offset) >> inode->i_blkbits);
+		set_buffer_mapped(bh);
+		break;
+	}
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap)
{
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
@@ -1904,15 +1960,15 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > PAGE_SIZE);
+ BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
head = create_page_buffers(page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
@@ -1928,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
- if (err)
- break;
+ if (get_block) {
+ err = get_block(inode, block, bh, 1);
+ if (err)
+ break;
+ } else {
+ iomap_to_bh(inode, block, bh, iomap);
+ }
+
if (buffer_new(bh)) {
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
@@ -1955,7 +2016,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++=bh;
}
}
@@ -1971,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
page_zero_new_buffers(page, from, to);
return err;
}
+
+/*
+ * get_block flavour of __block_write_begin_int(): forwards all arguments
+ * unchanged and supplies no iomap.  The helper only consults the iomap
+ * when get_block is NULL, so callers are expected to pass a real
+ * get_block callback here.
+ */
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
+{
+	struct iomap *no_iomap = NULL;
+
+	return __block_write_begin_int(page, pos, len, get_block, no_iomap);
+}
EXPORT_SYMBOL(__block_write_begin);
static int __block_commit_write(struct inode *inode, struct page *page,
@@ -2020,7 +2087,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
unsigned flags, struct page **pagep, get_block_t *get_block)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status;
@@ -2031,7 +2098,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
status = __block_write_begin(page, pos, len, get_block);
if (unlikely(status)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
@@ -2047,7 +2114,7 @@ int block_write_end(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
unsigned start;
- start = pos & (PAGE_CACHE_SIZE - 1);
+ start = pos & (PAGE_SIZE - 1);
if (unlikely(copied < len)) {
/*
@@ -2099,7 +2166,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -2136,9 +2203,9 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
head = page_buffers(page);
blocksize = head->b_size;
- to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
+ to = min_t(unsigned, PAGE_SIZE - from, count);
to = from + to;
- if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
+ if (from < blocksize && to > PAGE_SIZE - blocksize)
return 0;
bh = head;
@@ -2181,7 +2248,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
- iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
@@ -2248,7 +2315,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
if (buffer_uptodate(bh))
end_buffer_async_read(bh, 1);
else
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, 0, bh);
}
return 0;
}
@@ -2295,16 +2362,16 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
unsigned zerofrom, offset, len;
int err = 0;
- index = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ~PAGE_CACHE_MASK;
+ index = pos >> PAGE_SHIFT;
+ offset = pos & ~PAGE_MASK;
- while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
- zerofrom = curpos & ~PAGE_CACHE_MASK;
+ while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
+ zerofrom = curpos & ~PAGE_MASK;
if (zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
}
- len = PAGE_CACHE_SIZE - zerofrom;
+ len = PAGE_SIZE - zerofrom;
err = pagecache_write_begin(file, mapping, curpos, len,
AOP_FLAG_UNINTERRUPTIBLE,
@@ -2329,7 +2396,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
/* page covers the boundary, find the boundary offset */
if (index == curidx) {
- zerofrom = curpos & ~PAGE_CACHE_MASK;
+ zerofrom = curpos & ~PAGE_MASK;
/* if we will expand the thing last block will be filled */
if (offset <= zerofrom) {
goto out;
@@ -2375,7 +2442,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
if (err)
return err;
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ zerofrom = *bytes & ~PAGE_MASK;
if (pos+len > *bytes && zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
@@ -2430,10 +2497,10 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
}
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
- end = size & ~PAGE_CACHE_MASK;
+ if (((page->index + 1) << PAGE_SHIFT) > size)
+ end = size & ~PAGE_MASK;
else
- end = PAGE_CACHE_SIZE;
+ end = PAGE_SIZE;
ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
@@ -2508,8 +2575,8 @@ int nobh_write_begin(struct address_space *mapping,
int ret = 0;
int is_mapped_to_disk = 1;
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
+ index = pos >> PAGE_SHIFT;
+ from = pos & (PAGE_SIZE - 1);
to = from + len;
page = grab_cache_page_write_begin(mapping, index, flags);
@@ -2543,7 +2610,7 @@ int nobh_write_begin(struct address_space *mapping,
goto out_release;
}
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
/*
* We loop across all blocks in the page, whether or not they are
@@ -2551,7 +2618,7 @@ int nobh_write_begin(struct address_space *mapping,
* page is fully mapped-to-disk.
*/
for (block_start = 0, block_in_page = 0, bh = head;
- block_start < PAGE_CACHE_SIZE;
+ block_start < PAGE_SIZE;
block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
int create;
@@ -2582,7 +2649,7 @@ int nobh_write_begin(struct address_space *mapping,
if (block_start < from || block_end > to) {
lock_buffer(bh);
bh->b_end_io = end_buffer_read_nobh;
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, 0, bh);
nr_reads++;
}
}
@@ -2623,7 +2690,7 @@ failed:
out_release:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
*pagep = NULL;
return ret;
@@ -2653,7 +2720,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
while (head) {
bh = head;
@@ -2675,7 +2742,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
int ret;
@@ -2684,7 +2751,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
goto out;
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
/*
* The page may have dirty, unmapped buffers. For example,
@@ -2707,7 +2774,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
out:
ret = mpage_writepage(page, get_block, wbc);
if (ret == -EAGAIN)
@@ -2720,8 +2787,8 @@ EXPORT_SYMBOL(nobh_writepage);
int nobh_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize;
sector_t iblock;
unsigned length, pos;
@@ -2738,7 +2805,7 @@ int nobh_truncate_page(struct address_space *mapping,
return 0;
length = blocksize - length;
- iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
@@ -2748,7 +2815,7 @@ int nobh_truncate_page(struct address_space *mapping,
if (page_has_buffers(page)) {
has_buffers:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return block_truncate_page(mapping, from, get_block);
}
@@ -2772,7 +2839,7 @@ has_buffers:
if (!PageUptodate(page)) {
err = mapping->a_ops->readpage(NULL, page);
if (err) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
lock_page(page);
@@ -2789,7 +2856,7 @@ has_buffers:
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
return err;
}
@@ -2798,8 +2865,8 @@ EXPORT_SYMBOL(nobh_truncate_page);
int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize;
sector_t iblock;
unsigned length, pos;
@@ -2816,7 +2883,7 @@ int block_truncate_page(struct address_space *mapping,
return 0;
length = blocksize - length;
- iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
@@ -2852,7 +2919,7 @@ int block_truncate_page(struct address_space *mapping,
if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
err = -EIO;
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
@@ -2865,7 +2932,7 @@ int block_truncate_page(struct address_space *mapping,
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
return err;
}
@@ -2879,7 +2946,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
/* Is the page fully inside i_size? */
@@ -2888,14 +2955,14 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
end_buffer_async_write);
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
/*
* The page may have dirty, unmapped buffers. For example,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
@@ -2907,7 +2974,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
return __block_write_full_page(inode, page, get_block, wbc,
end_buffer_async_write);
}
@@ -2949,7 +3016,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
* errors, this only handles the "we need to be able to
* do IO at the final sector" case.
*/
-void guard_bio_eod(int rw, struct bio *bio)
+void guard_bio_eod(int op, struct bio *bio)
{
sector_t maxsector;
struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
@@ -2979,13 +3046,13 @@ void guard_bio_eod(int rw, struct bio *bio)
bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
- if ((rw & RW_MASK) == READ) {
+ if (op == REQ_OP_READ) {
zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
truncated_bytes);
}
}
-static int submit_bh_wbc(int rw, struct buffer_head *bh,
+static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
unsigned long bio_flags, struct writeback_control *wbc)
{
struct bio *bio;
@@ -2999,7 +3066,7 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
/*
* Only clear out a write error when rewriting
*/
- if (test_set_buffer_req(bh) && (rw & WRITE))
+ if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
/*
@@ -3024,39 +3091,42 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
- guard_bio_eod(rw, bio);
+ guard_bio_eod(op, bio);
if (buffer_meta(bh))
- rw |= REQ_META;
+ op_flags |= REQ_META;
if (buffer_prio(bh))
- rw |= REQ_PRIO;
+ op_flags |= REQ_PRIO;
+ bio_set_op_attrs(bio, op, op_flags);
- submit_bio(rw, bio);
+ submit_bio(bio);
return 0;
}
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+int _submit_bh(int op, int op_flags, struct buffer_head *bh,
+ unsigned long bio_flags)
{
- return submit_bh_wbc(rw, bh, bio_flags, NULL);
+ return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
}
EXPORT_SYMBOL_GPL(_submit_bh);
-int submit_bh(int rw, struct buffer_head *bh)
+int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(rw, bh, 0, NULL);
+ return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
+ * @op: whether to %REQ_OP_READ or %REQ_OP_WRITE
+ * @op_flags: rq_flag_bits
* @nr: number of &struct buffer_heads in the array
* @bhs: array of pointers to &struct buffer_head
*
* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
- * requests an I/O operation on them, either a %READ or a %WRITE. The third
- * %READA option is described in the documentation for generic_make_request()
- * which ll_rw_block() calls.
+ * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
+ * @op_flags contains flags modifying the detailed I/O behavior, most notably
+ * %REQ_RAHEAD.
*
* This function drops any buffer that it cannot get a lock on (with the
* BH_Lock state bit), any buffer that appears to be clean when doing a write
@@ -3072,7 +3142,7 @@ EXPORT_SYMBOL(submit_bh);
* All of the buffers must be for the same device, and must also be a
* multiple of the current approved size for the device.
*/
-void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
+void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
{
int i;
@@ -3081,18 +3151,18 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
if (!trylock_buffer(bh))
continue;
- if (rw == WRITE) {
+ if (op == WRITE) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(WRITE, bh);
+ submit_bh(op, op_flags, bh);
continue;
}
} else {
if (!buffer_uptodate(bh)) {
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(rw, bh);
+ submit_bh(op, op_flags, bh);
continue;
}
}
@@ -3101,7 +3171,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
}
EXPORT_SYMBOL(ll_rw_block);
-void write_dirty_buffer(struct buffer_head *bh, int rw)
+void write_dirty_buffer(struct buffer_head *bh, int op_flags)
{
lock_buffer(bh);
if (!test_clear_buffer_dirty(bh)) {
@@ -3110,7 +3180,7 @@ void write_dirty_buffer(struct buffer_head *bh, int rw)
}
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(rw, bh);
+ submit_bh(REQ_OP_WRITE, op_flags, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);
@@ -3119,7 +3189,7 @@ EXPORT_SYMBOL(write_dirty_buffer);
* and then start new I/O and then wait upon it. The caller must have a ref on
* the buffer_head.
*/
-int __sync_dirty_buffer(struct buffer_head *bh, int rw)
+int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
{
int ret = 0;
@@ -3128,7 +3198,7 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(rw, bh);
+ ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
wait_on_buffer(bh);
if (!ret && !buffer_uptodate(bh))
ret = -EIO;
@@ -3391,7 +3461,7 @@ int bh_submit_read(struct buffer_head *bh)
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, 0, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return 0;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 861d611b8c058..ce5f345d70f50 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
* check if the backing cache is updated to FS-Cache
* - called by FS-Cache when evaluates if need to invalidate the cache
*/
-static bool cachefiles_check_consistency(struct fscache_operation *op)
+static int cachefiles_check_consistency(struct fscache_operation *op)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 4ae75006e73bd..3f7c2cd41f8fd 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -263,6 +263,8 @@ requeue:
void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
struct cachefiles_object *object)
{
+ blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks;
+
write_lock(&cache->active_lock);
rb_erase(&object->active_node, &cache->active_nodes);
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
@@ -273,8 +275,7 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
/* This object can now be culled, so we need to let the daemon know
* that there is something it can remove if it needs to.
*/
- atomic_long_add(d_backing_inode(object->dentry)->i_blocks,
- &cache->b_released);
+ atomic_long_add(i_blocks, &cache->b_released);
if (atomic_inc_return(&cache->f_released))
cachefiles_state_changed(cache);
}
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
index eccd33941199c..125b90f6c796c 100644
--- a/fs/cachefiles/proc.c
+++ b/fs/cachefiles/proc.c
@@ -93,7 +93,6 @@ static int cachefiles_histogram_open(struct inode *inode, struct file *file)
}
static const struct file_operations cachefiles_histogram_fops = {
- .owner = THIS_MODULE,
.open = cachefiles_histogram_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c0f3da3926a0c..afbdc418966db 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -194,10 +194,10 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
error = -EIO;
}
- page_cache_release(monitor->back_page);
+ put_page(monitor->back_page);
fscache_end_io(op, monitor->netfs_page, error);
- page_cache_release(monitor->netfs_page);
+ put_page(monitor->netfs_page);
fscache_retrieval_complete(op, 1);
fscache_put_retrieval(op);
kfree(monitor);
@@ -288,8 +288,8 @@ monitor_backing_page:
_debug("- monitor add");
/* install the monitor */
- page_cache_get(monitor->netfs_page);
- page_cache_get(backpage);
+ get_page(monitor->netfs_page);
+ get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
add_page_wait_queue(backpage, &monitor->monitor);
@@ -310,7 +310,7 @@ backing_page_already_present:
_debug("- present");
if (newpage) {
- page_cache_release(newpage);
+ put_page(newpage);
newpage = NULL;
}
@@ -342,7 +342,7 @@ success:
out:
if (backpage)
- page_cache_release(backpage);
+ put_page(backpage);
if (monitor) {
fscache_put_retrieval(monitor->op);
kfree(monitor);
@@ -363,7 +363,7 @@ io_error:
goto out;
nomem_page:
- page_cache_release(newpage);
+ put_page(newpage);
nomem_monitor:
fscache_put_retrieval(monitor->op);
kfree(monitor);
@@ -530,7 +530,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
- page_cache_release(netpage);
+ put_page(netpage);
fscache_retrieval_complete(op, 1);
continue;
}
@@ -538,10 +538,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
}
/* install a monitor */
- page_cache_get(netpage);
+ get_page(netpage);
monitor->netfs_page = netpage;
- page_cache_get(backpage);
+ get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
add_page_wait_queue(backpage, &monitor->monitor);
@@ -555,10 +555,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
unlock_page(backpage);
}
- page_cache_release(backpage);
+ put_page(backpage);
backpage = NULL;
- page_cache_release(netpage);
+ put_page(netpage);
netpage = NULL;
continue;
@@ -603,7 +603,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
- page_cache_release(netpage);
+ put_page(netpage);
fscache_retrieval_complete(op, 1);
continue;
}
@@ -612,14 +612,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
copy_highpage(netpage, backpage);
- page_cache_release(backpage);
+ put_page(backpage);
backpage = NULL;
fscache_mark_page_cached(op, netpage);
/* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
- page_cache_release(netpage);
+ put_page(netpage);
netpage = NULL;
fscache_retrieval_complete(op, 1);
continue;
@@ -632,11 +632,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
out:
/* tidy up */
if (newpage)
- page_cache_release(newpage);
+ put_page(newpage);
if (netpage)
- page_cache_release(netpage);
+ put_page(netpage);
if (backpage)
- page_cache_release(backpage);
+ put_page(backpage);
if (monitor) {
fscache_put_retrieval(op);
kfree(monitor);
@@ -644,7 +644,7 @@ out:
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
- page_cache_release(netpage);
+ put_page(netpage);
fscache_retrieval_complete(op, 1);
}
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index f19708487e2f7..4f67227f69a5a 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -37,6 +37,8 @@ static inline void ceph_set_cached_acl(struct inode *inode,
spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
set_cached_acl(inode, type, acl);
+ else
+ forget_cached_acl(inode, type);
spin_unlock(&ci->i_ceph_lock);
}
@@ -88,7 +90,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
char *value = NULL;
struct iattr newattrs;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
- struct dentry *dentry;
switch (type) {
case ACL_TYPE_ACCESS:
@@ -126,29 +127,26 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
goto out_free;
}
- dentry = d_find_alias(inode);
if (new_mode != old_mode) {
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE;
- ret = ceph_setattr(dentry, &newattrs);
+ ret = __ceph_setattr(inode, &newattrs);
if (ret)
- goto out_dput;
+ goto out_free;
}
- ret = __ceph_setxattr(dentry, name, value, size, 0);
+ ret = __ceph_setxattr(inode, name, value, size, 0);
if (ret) {
if (new_mode != old_mode) {
newattrs.ia_mode = old_mode;
newattrs.ia_valid = ATTR_MODE;
- ceph_setattr(dentry, &newattrs);
+ __ceph_setattr(inode, &newattrs);
}
- goto out_dput;
+ goto out_free;
}
ceph_set_cached_acl(inode, type, acl);
-out_dput:
- dput(dentry);
out_free:
kfree(value);
out:
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index fc5cae2a0db2d..d5b6f959a3c3a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,7 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
inode = page->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != PAGE_CACHE_SIZE) {
+ if (offset != 0 || length != PAGE_SIZE) {
dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
inode, page, page->index, offset, length);
return;
@@ -197,10 +197,10 @@ static int readpage_nounlock(struct file *filp, struct page *page)
&ceph_inode_to_client(inode)->client->osdc;
int err = 0;
u64 off = page_offset(page);
- u64 len = PAGE_CACHE_SIZE;
+ u64 len = PAGE_SIZE;
if (off >= i_size_read(inode)) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
return 0;
}
@@ -212,7 +212,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
*/
if (off == 0)
return -EINVAL;
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
return 0;
}
@@ -234,9 +234,9 @@ static int readpage_nounlock(struct file *filp, struct page *page)
ceph_fscache_readpage_cancel(inode, page);
goto out;
}
- if (err < PAGE_CACHE_SIZE)
+ if (err < PAGE_SIZE)
/* zero fill remainder of page */
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ zero_user_segment(page, err, PAGE_SIZE);
else
flush_dcache_page(page);
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
/*
* Finish an async read(ahead) op.
*/
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_osd_data *osd_data;
- int rc = req->r_result;
- int bytes = le32_to_cpu(msg->hdr.data_len);
+ int rc = req->r_result <= 0 ? req->r_result : 0;
+ int bytes = req->r_result >= 0 ? req->r_result : 0;
int num_pages;
int i;
@@ -276,12 +276,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
- if (rc < 0 && rc != -ENOENT)
+ if (rc < 0 && rc != -ENOENT) {
+ ceph_fscache_readpage_cancel(inode, page);
goto unlock;
- if (bytes < (int)PAGE_CACHE_SIZE) {
+ }
+ if (bytes < (int)PAGE_SIZE) {
/* zero (remainder of) page */
int s = bytes < 0 ? 0 : bytes;
- zero_user_segment(page, s, PAGE_CACHE_SIZE);
+ zero_user_segment(page, s, PAGE_SIZE);
}
dout("finish_read %p uptodate %p idx %lu\n", inode, page,
page->index);
@@ -290,8 +292,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
ceph_readpage_to_fscache(inode, page);
unlock:
unlock_page(page);
- page_cache_release(page);
- bytes -= PAGE_CACHE_SIZE;
+ put_page(page);
+ bytes -= PAGE_SIZE;
}
kfree(osd_data->pages);
}
@@ -336,7 +338,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
if (max && nr_pages == max)
break;
}
- len = nr_pages << PAGE_CACHE_SHIFT;
+ len = nr_pages << PAGE_SHIFT;
dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
off, len);
vino = ceph_vino(inode);
@@ -364,7 +366,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
GFP_KERNEL)) {
ceph_fscache_uncache_page(inode, page);
- page_cache_release(page);
+ put_page(page);
dout("start_read %p add_to_page_cache failed %p\n",
inode, page);
nr_pages = i;
@@ -376,8 +378,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
req->r_callback = finish_read;
req->r_inode = inode;
- ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
ret = ceph_osdc_start_request(osdc, req, false);
if (ret < 0)
@@ -415,8 +415,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
if (rc == 0)
goto out;
- if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
- max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ if (fsc->mount_options->rsize >= PAGE_SIZE)
+ max = (fsc->mount_options->rsize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
dout("readpages %p file %p nr_pages %d max %d\n", inode,
@@ -484,7 +484,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
long writeback_stat;
u64 truncate_size;
u32 truncate_seq;
- int err = 0, len = PAGE_CACHE_SIZE;
+ int err = 0, len = PAGE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
@@ -537,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
- ceph_readpage_to_fscache(inode, page);
-
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
truncate_seq, truncate_size,
&inode->i_mtime, &page, 1);
if (err < 0) {
- dout("writepage setting page/mapping error %d %p\n", err, page);
+ struct writeback_control tmp_wbc;
+ if (!wbc)
+ wbc = &tmp_wbc;
+ if (err == -ERESTARTSYS) {
+ /* killed by SIGKILL */
+ dout("writepage interrupted page %p\n", page);
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ goto out;
+ }
+ dout("writepage setting page/mapping error %d %p\n",
+ err, page);
SetPageError(page);
mapping_set_error(&inode->i_data, err);
- if (wbc)
- wbc->pages_skipped++;
+ wbc->pages_skipped++;
} else {
dout("writepage cleaned page %p\n", page);
err = 0; /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
BUG_ON(!inode);
ihold(inode);
err = writepage_nounlock(page, wbc);
+ if (err == -ERESTARTSYS) {
+ /* The direct memory reclaimer was killed by SIGKILL. Return 0
+ * to prevent the caller from setting a mapping/page error. */
+ err = 0;
+ }
unlock_page(page);
iput(inode);
return err;
}
-
/*
* lame release_pages helper. release_pages() isn't exported to
* modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
* If we get an error, set the mapping error bit, but not the individual
* page error bits.
*/
-static void writepages_finish(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
bool remove_page;
-
dout("writepages_finish %p rc %d\n", inode, rc);
if (rc < 0)
mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
+ if (rc < 0)
+ SetPageError(page);
+
ceph_put_snap_context(page_snap_context(page));
page->private = 0;
ClearPagePrivate(page);
@@ -718,16 +731,19 @@ static int ceph_writepages_start(struct address_space *mapping,
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- pr_warn("writepage_start %p on forced umount\n", inode);
- truncate_pagecache(inode, 0);
+ if (ci->i_wrbuffer_ref > 0) {
+ pr_warn_ratelimited(
+ "writepage_start %p %lld forced umount\n",
+ inode, ceph_ino(inode));
+ }
mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
- if (wsize < PAGE_CACHE_SIZE)
- wsize = PAGE_CACHE_SIZE;
- max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+ if (wsize < PAGE_SIZE)
+ wsize = PAGE_SIZE;
+ max_pages_ever = wsize >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
@@ -737,8 +753,8 @@ static int ceph_writepages_start(struct address_space *mapping,
end = -1;
dout(" cyclic, start at %lu\n", start);
} else {
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ start = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
should_loop = 0;
@@ -887,7 +903,7 @@ get_more_pages:
num_ops = 1 + do_sync;
strip_unit_end = page->index +
- ((len - 1) >> PAGE_CACHE_SHIFT);
+ ((len - 1) >> PAGE_SHIFT);
BUG_ON(pages);
max_pages = calc_pages_for(0, (u64)len);
@@ -901,7 +917,7 @@ get_more_pages:
len = 0;
} else if (page->index !=
- (offset + len) >> PAGE_CACHE_SHIFT) {
+ (offset + len) >> PAGE_SHIFT) {
if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
CEPH_OSD_MAX_OPS)) {
redirty_page_for_writepage(wbc, page);
@@ -929,7 +945,7 @@ get_more_pages:
pages[locked_pages] = page;
locked_pages++;
- len += PAGE_CACHE_SIZE;
+ len += PAGE_SIZE;
}
/* did we get anything? */
@@ -981,7 +997,7 @@ new_request:
BUG_ON(IS_ERR(req));
}
BUG_ON(len < page_offset(pages[locked_pages - 1]) +
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -1011,7 +1027,7 @@ new_request:
}
set_page_writeback(pages[i]);
- len += PAGE_CACHE_SIZE;
+ len += PAGE_SIZE;
}
if (snap_size != -1) {
@@ -1020,7 +1036,7 @@ new_request:
/* writepages_finish() clears writeback pages
* according to the data length, so make sure
* data length covers all locked pages */
- u64 min_len = len + 1 - PAGE_CACHE_SIZE;
+ u64 min_len = len + 1 - PAGE_SIZE;
len = min(len, (u64)i_size_read(inode) - offset);
len = max(len, min_len);
}
@@ -1063,10 +1079,7 @@ new_request:
pages = NULL;
}
- vino = ceph_vino(inode);
- ceph_osdc_build_request(req, offset, snapc, vino.snap,
- &inode->i_mtime);
-
+ req->r_mtime = inode->i_mtime;
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
BUG_ON(rc);
req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
mapping->writeback_index = index;
out:
- if (req)
- ceph_osdc_put_request(req);
+ ceph_osdc_put_request(req);
ceph_put_snap_context(snapc);
dout("writepages done, rc = %d\n", rc);
return rc;
@@ -1134,14 +1146,21 @@ static int ceph_update_writeable_page(struct file *file,
struct page *page)
{
struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- loff_t page_off = pos & PAGE_CACHE_MASK;
- int pos_in_page = pos & ~PAGE_CACHE_MASK;
+ loff_t page_off = pos & PAGE_MASK;
+ int pos_in_page = pos & ~PAGE_MASK;
int end_in_page = pos_in_page + len;
loff_t i_size;
int r;
struct ceph_snap_context *snapc, *oldest;
+ if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout(" page %p forced umount\n", page);
+ unlock_page(page);
+ return -EIO;
+ }
+
retry_locked:
/* writepages currently holds page lock, but if we change that later, */
wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
snapc = ceph_get_snap_context(snapc);
unlock_page(page);
ceph_queue_writeback(inode);
- r = wait_event_interruptible(ci->i_cap_wq,
+ r = wait_event_killable(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc);
if (r == -ERESTARTSYS)
@@ -1191,7 +1210,7 @@ retry_locked:
}
/* full page? */
- if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+ if (pos_in_page == 0 && len == PAGE_SIZE)
return 0;
/* past end of file? */
@@ -1199,12 +1218,12 @@ retry_locked:
if (page_off >= i_size ||
(pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+ end_in_page - pos_in_page != PAGE_SIZE)) {
dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+ page, pos_in_page, end_in_page, (int)PAGE_SIZE);
zero_user_segments(page,
0, pos_in_page,
- end_in_page, PAGE_CACHE_SIZE);
+ end_in_page, PAGE_SIZE);
return 0;
}
@@ -1228,7 +1247,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = file_inode(file);
struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
int r;
do {
@@ -1242,7 +1261,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
r = ceph_update_writeable_page(file, pos, len, page);
if (r < 0)
- page_cache_release(page);
+ put_page(page);
else
*pagep = page;
} while (r == -EAGAIN);
@@ -1259,7 +1278,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
int check_cap = 0;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
@@ -1279,7 +1298,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
@@ -1292,8 +1311,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
* intercept O_DIRECT reads and writes early, this function should
* never get called.
*/
-static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos)
+static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
{
WARN_ON(1);
return -EINVAL;
@@ -1312,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
.direct_IO = ceph_direct_io,
};
+static void ceph_block_sigs(sigset_t *oldset)
+{
+ sigset_t mask;
+ siginitsetinv(&mask, sigmask(SIGKILL));
+ sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+ sigprocmask(SIG_SETMASK, oldset, NULL);
+}
/*
* vm ops
@@ -1322,28 +1351,26 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct page *pinned_page = NULL;
- loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
+ loff_t off = vmf->pgoff << PAGE_SHIFT;
int want, got, ret;
+ sigset_t oldset;
+
+ ceph_block_sigs(&oldset);
dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
- inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
+ inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
- while (1) {
- got = 0;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
- -1, &got, &pinned_page);
- if (ret == 0)
- break;
- if (ret != -ERESTARTSYS) {
- WARN_ON(1);
- return VM_FAULT_SIGBUS;
- }
- }
+
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+ if (ret < 0)
+ goto out_restore;
+
dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
- inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
+ inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
ci->i_inline_version == CEPH_INLINE_NONE)
@@ -1352,16 +1379,16 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = -EAGAIN;
dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
- inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+ inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got), ret);
if (pinned_page)
- page_cache_release(pinned_page);
+ put_page(pinned_page);
ceph_put_cap_refs(ci, got);
if (ret != -EAGAIN)
- return ret;
+ goto out_restore;
/* read inline data */
- if (off >= PAGE_CACHE_SIZE) {
+ if (off >= PAGE_SIZE) {
/* does not support inline data > PAGE_SIZE */
ret = VM_FAULT_SIGBUS;
} else {
@@ -1372,27 +1399,35 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
- goto out;
+ goto out_inline;
}
ret1 = __ceph_do_getattr(inode, page,
CEPH_STAT_CAP_INLINE_DATA, true);
if (ret1 < 0 || off >= i_size_read(inode)) {
unlock_page(page);
- page_cache_release(page);
- ret = VM_FAULT_SIGBUS;
- goto out;
+ put_page(page);
+ if (ret1 < 0)
+ ret = ret1;
+ else
+ ret = VM_FAULT_SIGBUS;
+ goto out_inline;
}
- if (ret1 < PAGE_CACHE_SIZE)
- zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
+ if (ret1 < PAGE_SIZE)
+ zero_user_segment(page, ret1, PAGE_SIZE);
else
flush_dcache_page(page);
SetPageUptodate(page);
vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+ dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+ inode, off, (size_t)PAGE_SIZE, ret);
}
-out:
- dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
- inode, off, (size_t)PAGE_CACHE_SIZE, ret);
+out_restore:
+ ceph_restore_sigs(&oldset);
+ if (ret < 0)
+ ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
return ret;
}
@@ -1410,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size = i_size_read(inode);
size_t len;
int want, got, ret;
+ sigset_t oldset;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
- return VM_FAULT_SIGBUS;
+ return VM_FAULT_OOM;
+
+ ceph_block_sigs(&oldset);
if (ci->i_inline_version != CEPH_INLINE_NONE) {
struct page *locked_page = NULL;
@@ -1424,16 +1462,14 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ceph_uninline_data(vma->vm_file, locked_page);
if (locked_page)
unlock_page(locked_page);
- if (ret < 0) {
- ret = VM_FAULT_SIGBUS;
+ if (ret < 0)
goto out_free;
- }
}
- if (off + PAGE_CACHE_SIZE <= size)
- len = PAGE_CACHE_SIZE;
+ if (off + PAGE_SIZE <= size)
+ len = PAGE_SIZE;
else
- len = size & ~PAGE_CACHE_MASK;
+ len = size & ~PAGE_MASK;
dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
inode, ceph_vinop(inode), off, len, size);
@@ -1441,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_BUFFER;
- while (1) {
- got = 0;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
- &got, NULL);
- if (ret == 0)
- break;
- if (ret != -ERESTARTSYS) {
- WARN_ON(1);
- ret = VM_FAULT_SIGBUS;
- goto out_free;
- }
- }
+
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+ &got, NULL);
+ if (ret < 0)
+ goto out_free;
+
dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
inode, off, len, ceph_cap_string(got));
/* Update time before taking page lock */
file_update_time(vma->vm_file);
- lock_page(page);
+ do {
+ lock_page(page);
- ret = VM_FAULT_NOPAGE;
- if ((off > size) ||
- (page->mapping != inode->i_mapping)) {
- unlock_page(page);
- goto out;
- }
+ if ((off > size) || (page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ break;
+ }
+
+ ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+ if (ret >= 0) {
+ /* success. we'll keep the page locked. */
+ set_page_dirty(page);
+ ret = VM_FAULT_LOCKED;
+ }
+ } while (ret == -EAGAIN);
- ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (ret >= 0) {
- /* success. we'll keep the page locked. */
- set_page_dirty(page);
- ret = VM_FAULT_LOCKED;
- } else {
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
- }
-out:
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
int dirty;
@@ -1496,8 +1523,10 @@ out:
inode, off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs(ci, got);
out_free:
+ ceph_restore_sigs(&oldset);
ceph_free_cap_flush(prealloc_cf);
-
+ if (ret < 0)
+ ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
return ret;
}
@@ -1519,7 +1548,7 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
return;
if (PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return;
}
}
@@ -1534,14 +1563,14 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
if (page != locked_page) {
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
else
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
@@ -1578,7 +1607,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
from_pagecache = true;
lock_page(page);
} else {
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
}
@@ -1586,8 +1615,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
if (page) {
len = i_size_read(inode);
- if (len > PAGE_CACHE_SIZE)
- len = PAGE_CACHE_SIZE;
+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
} else {
page = __page_cache_alloc(GFP_NOFS);
if (!page) {
@@ -1615,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
goto out;
}
- ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+ req->r_mtime = inode->i_mtime;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1658,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
goto out_put;
}
- ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+ req->r_mtime = inode->i_mtime;
err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!err)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1670,7 +1699,7 @@ out:
if (page && page != locked_page) {
if (from_pagecache) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
} else
__free_pages(page, 0);
}
@@ -1701,7 +1730,8 @@ enum {
POOL_WRITE = 2,
};
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+ s64 pool, struct ceph_string *pool_ns)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1709,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
struct page **pages;
+ size_t pool_ns_len;
int err = 0, err2 = 0, have = 0;
down_read(&mdsc->pool_perm_rwsem);
@@ -1720,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
up_read(&mdsc->pool_perm_rwsem);
if (*p)
goto out;
- dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+ pool, (int)pool_ns->len, pool_ns->str);
+ else
+ dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
parent = NULL;
while (*p) {
parent = *p;
@@ -1740,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
if (*p) {
@@ -1759,9 +1813,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
rd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
rd_req->r_base_oloc.pool = pool;
- snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
- "%llx.00000000", ci->i_vino.ino);
- rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+ if (pool_ns)
+ rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
+ ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+ err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1, false, GFP_NOFS);
@@ -1770,11 +1828,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock;
}
- wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
- wr_req->r_base_oloc.pool = pool;
- wr_req->r_base_oid = rd_req->r_base_oid;
+ ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+ ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+ err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
/* one page should be large enough for STAT data */
pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1785,12 +1846,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
0, false, true);
- ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
- &ci->vfs_inode.i_mtime);
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
- ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
- &ci->vfs_inode.i_mtime);
+ wr_req->r_mtime = ci->vfs_inode.i_mtime;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
@@ -1810,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock;
}
- perm = kmalloc(sizeof(*perm), GFP_NOFS);
+ pool_ns_len = pool_ns ? pool_ns->len : 0;
+ perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
if (!perm) {
err = -ENOMEM;
goto out_unlock;
@@ -1818,56 +1877,62 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
perm->pool = pool;
perm->perm = have;
+ perm->pool_ns_len = pool_ns_len;
+ if (pool_ns_len > 0)
+ memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+ perm->pool_ns[pool_ns_len] = 0;
+
rb_link_node(&perm->node, parent, p);
rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
err = 0;
out_unlock:
up_write(&mdsc->pool_perm_rwsem);
- if (rd_req)
- ceph_osdc_put_request(rd_req);
- if (wr_req)
- ceph_osdc_put_request(wr_req);
+ ceph_osdc_put_request(rd_req);
+ ceph_osdc_put_request(wr_req);
out:
if (!err)
err = have;
- dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+ pool, (int)pool_ns->len, pool_ns->str, err);
+ else
+ dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
{
- u32 pool;
+ s64 pool;
+ struct ceph_string *pool_ns;
int ret, flags;
- /* does not support pool namespace yet */
- if (ci->i_pool_ns_len)
- return -EIO;
-
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
spin_lock(&ci->i_ceph_lock);
flags = ci->i_ceph_flags;
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
spin_unlock(&ci->i_ceph_lock);
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %u no read perm\n",
+ dout("ceph_pool_perm_check pool %lld no read perm\n",
pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %u no write perm\n",
+ dout("ceph_pool_perm_check pool %lld no write perm\n",
pool);
return -EPERM;
}
return 0;
}
- ret = __ceph_pool_perm_get(ci, pool);
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+ ceph_put_string(pool_ns);
if (ret < 0)
return ret;
@@ -1878,10 +1943,11 @@ check:
flags |= CEPH_I_POOL_WR;
spin_lock(&ci->i_ceph_lock);
- if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
- ci->i_ceph_flags = flags;
+ if (pool == ci->i_layout.pool_id &&
+ pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+ ci->i_ceph_flags |= flags;
} else {
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
flags = ci->i_ceph_flags;
}
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a351480dbabc9..5bc5d37b12171 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -25,6 +25,7 @@
#include "cache.h"
struct ceph_aux_inode {
+ u64 version;
struct timespec mtime;
loff_t size;
};
@@ -69,15 +70,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
fsc, true);
-
- if (fsc->fscache == NULL) {
- pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
- return 0;
- }
-
- fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
- if (fsc->revalidate_wq == NULL)
- return -ENOMEM;
+ if (!fsc->fscache)
+ pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
return 0;
}
@@ -105,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
const struct inode* inode = &ci->vfs_inode;
memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
aux.mtime = inode->i_mtime;
aux.size = i_size_read(inode);
@@ -131,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
aux.mtime = inode->i_mtime;
aux.size = i_size_read(inode);
@@ -181,32 +177,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.now_uncached = ceph_fscache_inode_now_uncached,
};
-void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
- struct ceph_inode_info* ci)
+void ceph_fscache_register_inode_cookie(struct inode *inode)
{
- struct inode* inode = &ci->vfs_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
/* No caching for filesystem */
if (fsc->fscache == NULL)
return;
/* Only cache for regular files that are read only */
- if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
+ if (!S_ISREG(inode->i_mode))
return;
- /* Avoid multiple racing open requests */
- inode_lock(inode);
-
- if (ci->fscache)
- goto done;
-
- ci->fscache = fscache_acquire_cookie(fsc->fscache,
- &ceph_fscache_inode_object_def,
- ci, true);
- fscache_check_consistency(ci->fscache);
-done:
+ inode_lock_nested(inode, I_MUTEX_CHILD);
+ if (!ci->fscache) {
+ ci->fscache = fscache_acquire_cookie(fsc->fscache,
+ &ceph_fscache_inode_object_def,
+ ci, false);
+ }
inode_unlock(inode);
-
}
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
@@ -222,6 +212,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
fscache_relinquish_cookie(cookie, 0);
}
+static bool ceph_fscache_can_enable(void *data)
+{
+ struct inode *inode = data;
+ return !inode_is_open_for_write(inode);
+}
+
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!fscache_cookie_valid(ci->fscache))
+ return;
+
+ if (inode_is_open_for_write(inode)) {
+ dout("fscache_file_set_cookie %p %p disabling cache\n",
+ inode, filp);
+ fscache_disable_cookie(ci->fscache, false);
+ fscache_uncache_all_inode_pages(ci->fscache, inode);
+ } else {
+ fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
+ inode);
+ if (fscache_cookie_enabled(ci->fscache)) {
+ dout("fscache_file_set_cookie %p %p enabing cache\n",
+ inode, filp);
+ }
+ }
+}
+
static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
{
if (!error)
@@ -236,10 +254,9 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
unlock_page(page);
}
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
{
- return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
- (ci->i_fscache_gen == ci->i_rdcache_gen));
+ return ci->i_fscache_gen == ci->i_rdcache_gen;
}
@@ -332,69 +349,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
- if (fsc->revalidate_wq)
- destroy_workqueue(fsc->revalidate_wq);
-
fscache_relinquish_cookie(fsc->fscache, 0);
fsc->fscache = NULL;
}
-static void ceph_revalidate_work(struct work_struct *work)
-{
- int issued;
- u32 orig_gen;
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_revalidate_work);
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&ci->i_ceph_lock);
- issued = __ceph_caps_issued(ci, NULL);
- orig_gen = ci->i_rdcache_gen;
- spin_unlock(&ci->i_ceph_lock);
-
- if (!(issued & CEPH_CAP_FILE_CACHE)) {
- dout("revalidate_work lost cache before validation %p\n",
- inode);
- goto out;
- }
-
- if (!fscache_check_consistency(ci->fscache))
- fscache_invalidate(ci->fscache);
-
- spin_lock(&ci->i_ceph_lock);
- /* Update the new valid generation (backwards sanity check too) */
- if (orig_gen > ci->i_fscache_gen) {
- ci->i_fscache_gen = orig_gen;
- }
- spin_unlock(&ci->i_ceph_lock);
-
-out:
- iput(&ci->vfs_inode);
-}
-
-void ceph_queue_revalidate(struct inode *inode)
+/*
+ * caller should hold CEPH_CAP_FILE_{RD,CACHE}
+ */
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
+ if (cache_valid(ci))
return;
- ihold(inode);
-
- if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
- &ci->i_revalidate_work)) {
- dout("ceph_queue_revalidate %p\n", inode);
- } else {
- dout("ceph_queue_revalidate %p failed\n)", inode);
- iput(inode);
+ /* reuse i_truncate_mutex. There should be no pending
+ * truncate while the caller holds CEPH_CAP_FILE_RD */
+ mutex_lock(&ci->i_truncate_mutex);
+ if (!cache_valid(ci)) {
+ if (fscache_check_consistency(ci->fscache))
+ fscache_invalidate(ci->fscache);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_fscache_gen = ci->i_rdcache_gen;
+ spin_unlock(&ci->i_ceph_lock);
}
-}
-
-void ceph_fscache_inode_init(struct ceph_inode_info *ci)
-{
- ci->fscache = NULL;
- /* The first load is verifed cookie open time */
- ci->i_fscache_gen = 1;
- INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
+ mutex_unlock(&ci->i_truncate_mutex);
}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 5ac591bd012bc..7e72c7594f0c8 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -34,10 +34,10 @@ void ceph_fscache_unregister(void);
int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
-void ceph_fscache_inode_init(struct ceph_inode_info *ci);
-void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
- struct ceph_inode_info* ci);
+void ceph_fscache_register_inode_cookie(struct inode *inode);
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
int ceph_readpages_from_fscache(struct inode *inode,
@@ -46,12 +46,11 @@ int ceph_readpages_from_fscache(struct inode *inode,
unsigned *nr_pages);
void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
-void ceph_queue_revalidate(struct inode *inode);
-static inline void ceph_fscache_update_objectsize(struct inode *inode)
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- fscache_attr_changed(ci->fscache);
+ ci->fscache = NULL;
+ ci->i_fscache_gen = 0;
}
static inline void ceph_fscache_invalidate(struct inode *inode)
@@ -88,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode,
return fscache_readpages_cancel(ci->fscache, pages);
}
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+{
+ ci->i_fscache_gen = ci->i_rdcache_gen - 1;
+}
+
#else
static inline int ceph_fscache_register(void)
@@ -112,8 +116,20 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
}
-static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
- struct ceph_inode_info* ci)
+static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_file_set_cookie(struct inode *inode,
+ struct file *filp)
+{
+}
+
+static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
{
}
@@ -141,10 +157,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
{
}
-static inline void ceph_fscache_update_objectsize(struct inode *inode)
-{
-}
-
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
@@ -154,10 +166,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode,
{
}
-static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
-{
-}
-
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
{
return 1;
@@ -173,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode,
{
}
-static inline void ceph_queue_revalidate(struct inode *inode)
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
{
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index de17bb232ff8d..99115cae1652a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -40,6 +40,11 @@
* cluster to release server state.
*/
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid);
/*
* Generate readable cap strings for debugging output.
@@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int want = 0;
- int mode;
- for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
- if (ci->i_nr_by_mode[mode])
- want |= ceph_caps_for_mode(mode);
- return want;
+ int i, bits = 0;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (ci->i_nr_by_mode[i])
+ bits |= 1 << i;
+ }
+ if (bits == 0)
+ return 0;
+ return ceph_caps_for_mode(bits >> 1);
}
/*
@@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
- struct timespec *ctime, u64 time_warp_seq,
+ struct timespec *ctime, u32 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
@@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct inode *inode = &ci->vfs_inode;
u64 cap_id = cap->cap_id;
int held, revoking, dropping, keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
+ u64 follows, size, max_size;
+ u32 seq, issue_seq, mseq, time_warp_seq;
struct timespec mtime, atime, ctime;
int wake = 0;
umode_t mode;
@@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
return delayed;
}
+static inline int __send_flush_snap(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap_snap *capsnap,
+ u32 mseq, u64 oldest_flush_tid)
+{
+ return send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->cap_flush.tid,
+ oldest_flush_tid, 0, mseq, capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ &capsnap->ctime, capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ capsnap->xattr_version, capsnap->xattr_blob,
+ capsnap->follows, capsnap->inline_data);
+}
+
/*
* When a snapshot is taken, clients accumulate dirty metadata on
* inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @kick is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
* Called under i_ceph_lock. Takes s_mutex as needed.
*/
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int kick)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
- int mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
+ u64 oldest_flush_tid = 0;
+ u64 first_tid = 1, last_tid = 0;
- if (psession)
- session = *psession;
+ dout("__flush_snaps %p session %p\n", inode, session);
- dout("__flush_snaps %p\n", inode);
-retry:
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infiniute loop after retry */
- if (capsnap->follows < next_follows)
- continue;
/*
* we need to wait for sync writes to complete and for dirty
* pages to be written out.
@@ -1263,97 +1271,129 @@ retry:
/* should be removed by ceph_try_drop_cap_snap() */
BUG_ON(!capsnap->need_flush);
- /* pick mds, take s_mutex */
- if (ci->i_auth_cap == NULL) {
- dout("no auth cap (migrating?), doing nothing\n");
- goto out;
- }
-
/* only flush each capsnap once */
- if (!kick && !list_empty(&capsnap->flushing_item)) {
- dout("already flushed %p, skipping\n", capsnap);
+ if (capsnap->cap_flush.tid > 0) {
+ dout(" already flushed %p, skipping\n", capsnap);
continue;
}
- mds = ci->i_auth_cap->session->s_mds;
- mseq = ci->i_auth_cap->mseq;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&capsnap->cap_flush.g_list,
+ &mdsc->cap_flush_list);
+ if (oldest_flush_tid == 0)
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item,
+ &session->s_cap_flushing);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- if (kick)
- goto out;
+ list_add_tail(&capsnap->cap_flush.i_list,
+ &ci->i_cap_flush_list);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
+ if (first_tid == 1)
+ first_tid = capsnap->cap_flush.tid;
+ last_tid = capsnap->cap_flush.tid;
+ }
+
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+ while (first_tid <= last_tid) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+
+ if (!(cap && cap->session == session)) {
+ dout("__flush_snaps %p auth cap %p not mds%d, "
+ "stop\n", inode, cap, session->s_mds);
+ break;
}
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
+
+ ret = -ENOENT;
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid >= first_tid) {
+ ret = 0;
+ break;
}
- /*
- * if session == NULL, we raced against a cap
- * deletion or migration. retry, and we'll
- * get a better @mds value next time.
- */
- spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+ if (ret < 0)
+ break;
- spin_lock(&mdsc->cap_dirty_lock);
- capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
- spin_unlock(&mdsc->cap_dirty_lock);
+ first_tid = cf->tid + 1;
+ capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
atomic_inc(&capsnap->nref);
- if (list_empty(&capsnap->flushing_item))
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock);
- dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
- inode, capsnap, capsnap->follows, capsnap->flush_tid);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0,
- 0, mseq, capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- &capsnap->ctime, capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows, capsnap->inline_data);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
+ dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("__flush_snaps: error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid, capsnap->follows);
+ }
+
+ ceph_put_cap_snap(capsnap);
spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+}
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = *psession;
+ int mds;
+ dout("ceph_flush_snaps %p\n", inode);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+ dout(" no capsnap needs flush, doing nothing\n");
+ goto out;
+ }
+ if (!ci->i_auth_cap) {
+ dout(" no auth cap (migrating?), doing nothing\n");
+ goto out;
+ }
-out:
- if (psession)
- *psession = session;
- else if (session) {
+ mds = ci->i_auth_cap->session->s_mds;
+ if (session && session->s_mds != mds) {
+ dout(" oops, wrong session %p mutex\n", session);
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout(" inverting session/ino locks on %p\n", session);
+ mutex_lock(&session->s_mutex);
+ }
+ goto retry;
}
-}
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, NULL, 0);
+ __ceph_flush_snaps(ci, session);
+out:
spin_unlock(&ci->i_ceph_lock);
+
+ if (psession) {
+ *psession = session;
+ } else {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
}
/*
@@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return dirty;
}
-static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, i_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->i_node, parent, p);
- rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
-}
-
-static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, g_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->g_node, parent, p);
- rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
-}
-
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
@@ -1470,23 +1464,54 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf)
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
{
- struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
- if (n) {
+ if (!list_empty(&mdsc->cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, g_node);
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
return cf->tid;
}
return 0;
}
/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+ if (mdsc) {
+ /* are there older pending cap flushes? */
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->g_list);
+ } else if (ci) {
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->i_list);
+ } else {
+ BUG_ON(1);
+ }
+ return wake;
+}
+
+/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
* Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session,
+ struct ceph_mds_session *session, bool wake,
u64 *flush_tid, u64 *oldest_flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode,
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
+ cf->wake = wake;
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
cf->tid = ++mdsc->last_cap_flush_tid;
- __add_cap_flushing_to_mdsc(mdsc, cf);
+ list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
- } else {
- list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) tid %llu\n",
- inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
- __add_cap_flushing_to_inode(ci, cf);
+ list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
*flush_tid = cf->tid;
return flushing;
@@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int queue_invalidate = 0;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
+ int delayed = 0, sent = 0, num;
+ bool is_delayed = flags & CHECK_CAPS_NODELAY;
+ bool queue_invalidate = false;
+ bool force_requeue = false;
+ bool tried_invalidate = false;
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
@@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session, 0);
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1656,7 +1675,7 @@ retry_locked:
*/
if ((!is_delayed || mdsc->stopping) &&
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
- ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
+ !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
@@ -1666,17 +1685,17 @@ retry_locked:
if (revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) {
dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
+ queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
} else {
dout("check_caps failed to invalidate pages\n");
/* we failed to invalidate pages. check these
caps again later. */
- force_requeue = 1;
+ force_requeue = true;
__cap_set_timeouts(mdsc, ci);
}
}
- tried_invalidate = 1;
+ tried_invalidate = true;
goto retry_locked;
}
@@ -1698,8 +1717,8 @@ retry_locked:
revoking = cap->implemented & ~cap->issued;
dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap->issued),
- ceph_cap_string(cap_used),
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
@@ -1720,10 +1739,15 @@ retry_locked:
}
}
/* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
+ if (cap == ci->i_auth_cap) {
+ if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+ dout("flushing snap caps\n");
+ goto ack;
+ }
}
/* completed revocation? going down and there are no caps? */
@@ -1782,6 +1806,26 @@ ack:
goto retry;
}
}
+
+ /* kick flushing and flush snaps before sending normal
+ * cap message */
+ if (cap == ci->i_auth_cap &&
+ (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+
+ goto retry_locked;
+ }
+
/* take snap_rwsem after session mutex */
if (!took_snap_rwsem) {
if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
@@ -1796,7 +1840,7 @@ ack:
}
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
- flushing = __mark_caps_flushing(inode, session,
+ flushing = __mark_caps_flushing(inode, session, false,
&flush_tid,
&oldest_flush_tid);
} else {
@@ -1822,7 +1866,7 @@ ack:
* otherwise cancel.
*/
if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
+ force_requeue = true; /* __send_cap delayed release; requeue */
if (!delayed && !is_delayed)
__cap_delay_cancel(mdsc, ci);
else if (!is_delayed || force_requeue)
@@ -1873,8 +1917,8 @@ retry:
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
- flushing = __mark_caps_flushing(inode, session, &flush_tid,
- &oldest_flush_tid);
+ flushing = __mark_caps_flushing(inode, session, true,
+ &flush_tid, &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
@@ -1887,10 +1931,11 @@ retry:
spin_unlock(&ci->i_ceph_lock);
}
} else {
- struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
- if (n) {
+ if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, i_node);
+ list_last_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ cf->wake = true;
flush_tid = cf->tid;
}
flushing = ci->i_flushing_caps;
@@ -1910,14 +1955,13 @@ out:
static int caps_are_flushed(struct inode *inode, u64 flush_tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap_flush *cf;
- struct rb_node *n;
int ret = 1;
spin_lock(&ci->i_ceph_lock);
- n = rb_first(&ci->i_cap_flush_tree);
- if (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush * cf =
+ list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
if (cf->tid <= flush_tid)
ret = 0;
}
@@ -1926,53 +1970,6 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-static void sync_write_wait(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
-
- if (!S_ISREG(inode->i_mode))
- return;
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- /* set upper bound as _last_ entry in chain */
- req = list_last_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
-
- do {
- ceph_osdc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
- dout("sync_write_wait on tid %llu (until %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- spin_lock(&ci->i_unsafe_lock);
- ceph_osdc_put_request(req);
-
- /*
- * from here on look at first entry in chain, since we
- * only want to wait for anything older than last_tid
- */
- if (list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
-}
-
-/*
* wait for any unsafe requests to complete.
*/
static int unsafe_request_wait(struct inode *inode)
@@ -2024,7 +2021,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int dirty;
dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- sync_write_wait(inode);
+
+ ceph_sync_write_wait(inode);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
@@ -2087,87 +2085,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
return err;
}
-/*
- * After a recovering MDS goes active, we need to resend any caps
- * we were flushing.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_cap_snap *capsnap;
-
- dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
- list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
- flushing_item) {
- struct ceph_inode_info *ci = capsnap->ci;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
- cap, capsnap);
- __ceph_flush_snaps(ci, &session, 1);
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- }
- spin_unlock(&ci->i_ceph_lock);
- }
-}
-
-static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct ceph_inode_info *ci)
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
- struct rb_node *n;
- int delayed = 0;
+ int ret;
u64 first_tid = 0;
- u64 oldest_flush_tid;
- spin_lock(&mdsc->cap_dirty_lock);
- oldest_flush_tid = __get_oldest_flush_tid(mdsc);
- spin_unlock(&mdsc->cap_dirty_lock);
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid < first_tid)
+ continue;
- while (true) {
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- spin_unlock(&ci->i_ceph_lock);
+ pr_err("%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
break;
}
- for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- if (cf->tid >= first_tid)
- break;
- }
- if (!n) {
+ first_tid = cf->tid + 1;
+
+ if (cf->caps) {
+ dout("kick_flushing_caps %p cap %p tid %llu %s\n",
+ inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ cf->caps, cf->tid, oldest_flush_tid);
+ if (ret) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flush, ino (%llx.%llx) "
+ "tid %llu flushing %s\n",
+ ceph_vinop(inode), cf->tid,
+ ceph_cap_string(cf->caps));
+ }
+ } else {
+ struct ceph_cap_snap *capsnap =
+ container_of(cf, struct ceph_cap_snap,
+ cap_flush);
+ dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
+
+ atomic_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- break;
- }
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flushsnap, ino (%llx.%llx) "
+ "tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
+ }
- first_tid = cf->tid + 1;
+ ceph_put_cap_snap(capsnap);
+ }
- dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
- cap, cf->tid, ceph_cap_string(cf->caps));
- delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- cf->caps, cf->tid, oldest_flush_tid);
+ spin_lock(&ci->i_ceph_lock);
}
- return delayed;
}
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
@@ -2175,8 +2160,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
{
struct ceph_inode_info *ci;
struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
@@ -2196,10 +2187,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
*/
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
- spin_unlock(&ci->i_ceph_lock);
- if (!__kick_flushing_caps(mdsc, session, ci))
- continue;
- spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ } else {
+ ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
}
spin_unlock(&ci->i_ceph_lock);
@@ -2210,50 +2202,56 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci;
-
- kick_flushing_capsnaps(mdsc, session);
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- int delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
}
+ spin_unlock(&ci->i_ceph_lock);
}
}
static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct inode *inode)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s\n", inode,
ceph_cap_string(ci->i_flushing_caps));
- __ceph_flush_snaps(ci, &session, 1);
-
- if (ci->i_flushing_caps) {
- int delayed;
-
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ u64 oldest_flush_tid;
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
-
- delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -2317,7 +2315,7 @@ again:
/* make sure file is actually open */
file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) == 0) {
+ if ((file_wanted & need) != need) {
dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
ceph_cap_string(need), ceph_cap_string(file_wanted));
*err = -EBADF;
@@ -2393,6 +2391,9 @@ again:
snap_rwsem_locked = true;
}
*got = need | (have & want);
+ if ((need & CEPH_CAP_FILE_RD) &&
+ !(*got & CEPH_CAP_FILE_CACHE))
+ ceph_disable_fscache_readpage(ci);
__take_cap_refs(ci, *got, true);
ret = 1;
}
@@ -2412,12 +2413,26 @@ again:
goto out_unlock;
}
- if (!__ceph_is_any_caps(ci) &&
- ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- dout("get_cap_refs %p forced umount\n", inode);
- *err = -EIO;
- ret = 1;
- goto out_unlock;
+ if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+ int mds_wanted;
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+ CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ *err = -EIO;
+ ret = 1;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci);
+ if ((mds_wanted & need) != need) {
+ dout("get_cap_refs %p caps were dropped"
+ " (session killed?)\n", inode);
+ *err = -ESTALE;
+ ret = 1;
+ goto out_unlock;
+ }
+ if ((mds_wanted & file_wanted) ==
+ (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+ ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
}
dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2502,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
if (err == -EAGAIN)
continue;
if (err < 0)
- return err;
+ ret = err;
} else {
ret = wait_event_interruptible(ci->i_cap_wq,
try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2511,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
continue;
if (err < 0)
ret = err;
- if (ret < 0)
- return ret;
+ }
+ if (ret < 0) {
+ if (err == -ESTALE) {
+ /* session was killed, try renew caps */
+ ret = ceph_renew_caps(&ci->vfs_inode);
+ if (ret == 0)
+ continue;
+ }
+ return ret;
}
if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2510,7 +2532,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
*pinned_page = page;
break;
}
- page_cache_release(page);
+ put_page(page);
}
/*
* drop cap refs first because getattr while
@@ -2533,6 +2555,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
break;
}
+ if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+ ceph_fscache_revalidate_cookie(ci);
+
*got = _got;
return 0;
}
@@ -2553,16 +2578,19 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
* drop cap_snap that is not associated with any snapshot.
* we don't need to send FLUSHSNAP message for it.
*/
-static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
{
if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) {
-
dout("dropping cap_snap %p follows %llu\n",
capsnap, capsnap->follows);
+ BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context);
+ if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap);
return 1;
}
@@ -2609,7 +2637,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
struct ceph_cap_snap,
ci_item);
capsnap->writing = 0;
- if (ceph_try_drop_cap_snap(capsnap))
+ if (ceph_try_drop_cap_snap(ci, capsnap))
put++;
else if (__ceph_finish_cap_snap(ci, capsnap))
flushsnaps = 1;
@@ -2634,7 +2662,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (last && !flushsnaps)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
- ceph_flush_snaps(ci);
+ ceph_flush_snaps(ci, NULL);
if (wake)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0)
@@ -2652,15 +2680,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
struct inode *inode = &ci->vfs_inode;
- int last = 0;
- int complete_capsnap = 0;
- int drop_capsnap = 0;
- int found = 0;
struct ceph_cap_snap *capsnap = NULL;
+ int put = 0;
+ bool last = false;
+ bool found = false;
+ bool flush_snaps = false;
+ bool complete_capsnap = false;
spin_lock(&ci->i_ceph_lock);
ci->i_wrbuffer_ref -= nr;
- last = !ci->i_wrbuffer_ref;
+ if (ci->i_wrbuffer_ref == 0) {
+ last = true;
+ put++;
+ }
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
@@ -2680,15 +2712,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
} else {
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->context == snapc) {
- found = 1;
+ found = true;
break;
}
}
BUG_ON(!found);
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
- complete_capsnap = 1;
- drop_capsnap = ceph_try_drop_cap_snap(capsnap);
+ complete_capsnap = true;
+ if (!capsnap->writing) {
+ if (ceph_try_drop_cap_snap(ci, capsnap)) {
+ put++;
+ } else {
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ flush_snaps = true;
+ }
+ }
}
dout("put_wrbuffer_cap_refs on %p cap_snap %p "
" snap %lld %d/%d -> %d/%d %s%s\n",
@@ -2703,12 +2742,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (last) {
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- iput(inode);
- } else if (complete_capsnap) {
- ceph_flush_snaps(ci);
- wake_up_all(&ci->i_cap_wq);
+ } else if (flush_snaps) {
+ ceph_flush_snaps(ci, NULL);
}
- if (drop_capsnap)
+ if (complete_capsnap)
+ wake_up_all(&ci->i_cap_wq);
+ while (put-- > 0)
iput(inode);
}
@@ -2752,12 +2791,11 @@ static void invalidate_aliases(struct inode *inode)
*/
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
- u64 inline_version,
- void *inline_data, int inline_len,
+ struct ceph_string **pns, u64 inline_version,
+ void *inline_data, u32 inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued,
- u32 pool_ns_len)
+ struct ceph_cap *cap, int issued)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2774,7 +2812,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
bool writeback = false;
bool queue_trunc = false;
bool queue_invalidate = false;
- bool queue_revalidate = false;
bool deleted_inode = false;
bool fill_inline = false;
@@ -2807,7 +2844,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- !ci->i_wrbuffer_ref) {
+ !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
if (try_nonblocking_invalidate(inode)) {
/* there were locked pages.. invalidate later
in a separate thread. */
@@ -2816,8 +2853,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
-
- ceph_fscache_invalidate(inode);
}
/* side effects now are allowed */
@@ -2859,11 +2894,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
}
}
- /* Do we need to revalidate our fscache cookie. Don't bother on the
- * first cache cap as we already validate at cookie creation time. */
- if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
- queue_revalidate = true;
-
if (newcaps & CEPH_CAP_ANY_RD) {
/* ctime/mtime/atime? */
ceph_decode_timespec(&mtime, &grant->mtime);
@@ -2876,8 +2906,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
- ci->i_layout = grant->layout;
- ci->i_pool_ns_len = pool_ns_len;
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
+
+ if (ci->i_layout.pool_id != old_pool || *pns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ *pns = old_ns;
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
@@ -2960,23 +3000,20 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
fill_inline = true;
}
- spin_unlock(&ci->i_ceph_lock);
-
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
wake = true;
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ } else {
+ spin_unlock(&ci->i_ceph_lock);
}
if (fill_inline)
ceph_fill_inline_data(inode, NULL, inline_data, inline_len);
- if (queue_trunc) {
+ if (queue_trunc)
ceph_queue_vmtruncate(inode);
- ceph_queue_revalidate(inode);
- } else if (queue_revalidate)
- ceph_queue_revalidate(inode);
if (writeback)
/*
@@ -3013,23 +3050,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_cap_flush *cf;
- struct rb_node *n;
+ struct ceph_cap_flush *cf, *tmp_cf;
LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
- int drop = 0;
+ bool drop = false;
+ bool wake_ci = 0;
+ bool wake_mdsc = 0;
- n = rb_first(&ci->i_cap_flush_tree);
- while (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- n = rb_next(&cf->i_node);
+ list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid == flush_tid)
cleaned = cf->caps;
+ if (cf->caps == 0) /* capsnap */
+ continue;
if (cf->tid <= flush_tid) {
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add_tail(&cf->list, &to_remove);
+ if (__finish_cap_flush(NULL, ci, cf))
+ wake_ci = true;
+ list_add_tail(&cf->i_list, &to_remove);
} else {
cleaned &= ~cf->caps;
if (!cleaned)
@@ -3050,31 +3088,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&to_remove)) {
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
-
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (!cf || cf->tid > flush_tid)
- wake_up_all(&mdsc->cap_flushing_wq);
+ list_for_each_entry(cf, &to_remove, i_list) {
+ if (__finish_cap_flush(mdsc, NULL, cf))
+ wake_mdsc = true;
}
if (ci->i_flushing_caps == 0) {
- list_del_init(&ci->i_flushing_item);
- if (!list_empty(&session->s_cap_flushing))
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ if (list_empty(&ci->i_cap_flush_list)) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing)) {
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ }
+ }
mdsc->num_cap_flushing--;
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
- drop = 1;
+ drop = true;
if (ci->i_wr_ref == 0 &&
ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
@@ -3086,17 +3122,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
spin_unlock(&mdsc->cap_dirty_lock);
- wake_up_all(&ci->i_cap_wq);
out:
spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) {
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
+
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
if (drop)
iput(inode);
}
@@ -3115,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap;
- int drop = 0;
+ bool flushed = false;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
inode, ci, session->s_mds, follows);
@@ -3123,30 +3165,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) {
- if (capsnap->flush_tid != flush_tid) {
+ if (capsnap->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
" %lld\n", capsnap, follows,
- flush_tid, capsnap->flush_tid);
+ flush_tid, capsnap->cap_flush.tid);
break;
}
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- wake_up_all(&mdsc->cap_flushing_wq);
- drop = 1;
+ flushed = true;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
capsnap, capsnap->follows);
}
}
+ if (flushed) {
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing %p cap_snap %p follows %lld\n",
+ inode, capsnap, follows);
+ list_del(&capsnap->ci_item);
+ if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
+ wake_ci = true;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
+ wake_mdsc = true;
+
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
spin_unlock(&ci->i_ceph_lock);
- if (drop)
+ if (flushed) {
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
iput(inode);
+ }
}
/*
@@ -3178,10 +3237,8 @@ static void handle_cap_trunc(struct inode *inode,
truncate_seq, truncate_size, size);
spin_unlock(&ci->i_ceph_lock);
- if (queue_trunc) {
+ if (queue_trunc)
ceph_queue_vmtruncate(inode);
- ceph_fscache_invalidate(inode);
- }
}
/*
@@ -3226,6 +3283,8 @@ retry:
if (target < 0) {
__ceph_remove_cap(cap, false);
+ if (!ci->i_auth_cap)
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
goto out_unlock;
}
@@ -3251,7 +3310,8 @@ retry:
tcap->implemented |= issued;
if (cap == ci->i_auth_cap)
ci->i_auth_cap = tcap;
- if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&tcap->session->s_cap_flushing);
@@ -3404,20 +3464,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_cap *cap;
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
- struct ceph_snap_realm *realm;
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_string *pool_ns = NULL;
int mds = session->s_mds;
int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
- u64 cap_id;
- u64 size, max_size;
u64 tid;
u64 inline_version = 0;
void *inline_data = NULL;
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
- u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3431,11 +3489,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
op = le32_to_cpu(h->op);
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
- cap_id = le64_to_cpu(h->cap_id);
seq = le32_to_cpu(h->seq);
mseq = le32_to_cpu(h->migrate_seq);
- size = le64_to_cpu(h->size);
- max_size = le64_to_cpu(h->max_size);
snaptrace = h + 1;
snaptrace_len = le32_to_cpu(h->snap_trace_len);
@@ -3474,6 +3529,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u64 flush_tid;
u32 caller_uid, caller_gid;
u32 osd_epoch_barrier;
+ u32 pool_ns_len;
/* version >= 5 */
ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
/* version >= 6 */
@@ -3483,6 +3539,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_decode_32_safe(&p, end, caller_gid, bad);
/* version >= 8 */
ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ if (pool_ns_len > 0) {
+ ceph_decode_need(&p, end, pool_ns_len, bad);
+ pool_ns = ceph_find_or_create_string(p, pool_ns_len);
+ p += pool_ns_len;
+ }
}
/* lookup ino */
@@ -3503,7 +3564,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap = ceph_get_cap(mdsc, NULL);
cap->cap_ino = vino.ino;
cap->queue_release = 1;
- cap->cap_id = cap_id;
+ cap->cap_id = le64_to_cpu(h->cap_id);
cap->mseq = mseq;
cap->seq = seq;
spin_lock(&session->s_cap_lock);
@@ -3538,10 +3599,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3563,10 +3623,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -3597,6 +3656,7 @@ done:
mutex_unlock(&session->s_mutex);
done_unlocked:
iput(inode);
+ ceph_put_string(pool_ns);
return;
bad:
@@ -3657,6 +3717,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
+void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ int mode;
+ int mask = (fmode << 1) | 1; /* slot 0 is counted for every fmode */
+ for (mode = 0; mode < CEPH_FILE_MODE_BITS; mode++) {
+ if ((mask >> mode) & 1)
+ ci->i_nr_by_mode[mode]++;
+ }
+}
+
/*
* Drop open file reference. If we were the last open file,
* we may need to release capabilities to the MDS (or schedule
@@ -3664,15 +3734,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
-
+ int i, last = 0;
+ int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
- dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
- ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
- BUG_ON(ci->i_nr_by_mode[fmode] == 0);
- if (--ci->i_nr_by_mode[fmode] == 0)
- last++;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i)) {
+ BUG_ON(ci->i_nr_by_mode[i] == 0);
+ if (--ci->i_nr_by_mode[i] == 0)
+ last++;
+ }
+ }
+ dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
+ &ci->vfs_inode, fmode,
+ ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
+ ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
if (last && ci->i_vino.snap == CEPH_NOSNAP)
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 31f831471ed28..39ff678e567fc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
kfree(path);
- } else if (req->r_path2) {
+ } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
req->r_path2);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fadc243dfb284..c64a0b794d491 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -5,6 +5,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/xattr.h>
#include "super.h"
#include "mds_client.h"
@@ -58,7 +59,7 @@ int ceph_init_dentry(struct dentry *dentry)
di->dentry = dentry;
di->lease_session = NULL;
- dentry->d_time = jiffies;
+ di->time = jiffies;
/* avoid reordering d_fsdata setup so that the check above is safe */
smp_mb();
dentry->d_fsdata = di;
@@ -69,16 +70,42 @@ out_unlock:
}
/*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * for f_pos for readdir:
+ * - hash order:
+ * (0xff << 52) | ((24 bits hash) << 28) |
+ * (the nth entry has hash collision);
+ * - frag+name order;
+ * ((frag value) << 28) | (the nth entry in frag);
*/
+#define OFFSET_BITS 28
+#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+ loff_t fpos = ((loff_t)high << OFFSET_BITS) | (loff_t)off; /* high: frag or hash, off: nth entry */
+ if (hash_order)
+ fpos |= HASH_ORDER; /* mark the position as hash-ordered */
+ return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+ return !(~p & HASH_ORDER); /* true only when every HASH_ORDER bit is set in p */
+}
+
static unsigned fpos_frag(loff_t p)
{
- return p >> 32;
+ return p >> OFFSET_BITS;
}
+
+static unsigned fpos_hash(loff_t p)
+{
+ return ceph_frag_value(fpos_frag(p)); /* presumably strips the 0xff marker, leaving the 24-bit hash -- confirm */
+}
+
static unsigned fpos_off(loff_t p)
{
- return p & 0xffffffff;
+ return p & OFFSET_MASK;
}
static int fpos_cmp(loff_t l, loff_t r)
@@ -110,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
return 0;
}
+
+static struct dentry *
+__dcache_find_get_entry(struct dentry *parent, u64 idx,
+ struct ceph_readdir_cache_control *cache_ctl)
+{
+ struct inode *dir = d_inode(parent);
+ struct dentry *dentry;
+ unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
+ loff_t ptr_pos = idx * sizeof(struct dentry *);
+ pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
+ /* returns the dentry in slot idx, NULL past i_size, or ERR_PTR(-EAGAIN) */
+ if (ptr_pos >= i_size_read(dir))
+ return NULL;
+
+ if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
+ ceph_readdir_cache_release(cache_ctl);
+ cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
+ if (!cache_ctl->page) {
+ dout(" page %lu not found\n", ptr_pgoff);
+ return ERR_PTR(-EAGAIN);
+ }
+ /* reading and filling the cache are serialized by
+ * i_mutex, so the page lock is not needed */
+ unlock_page(cache_ctl->page);
+ cache_ctl->dentries = kmap(cache_ctl->page);
+ }
+ /* slot of idx within the currently mapped page */
+ cache_ctl->index = idx & idx_mask;
+
+ rcu_read_lock();
+ spin_lock(&parent->d_lock);
+ /* check i_size again here, because an empty directory can be
+ * marked as complete while not holding the i_mutex. */
+ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
+ dentry = cache_ctl->dentries[cache_ctl->index];
+ else
+ dentry = NULL;
+ spin_unlock(&parent->d_lock);
+ if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+ dentry = NULL;
+ rcu_read_unlock();
+ return dentry ? : ERR_PTR(-EAGAIN); /* no usable entry: report -EAGAIN */
+}
+
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
@@ -129,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct inode *dir = d_inode(parent);
struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
- unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
- int err = 0;
- loff_t ptr_pos = 0;
struct ceph_readdir_cache_control cache_ctl = {};
+ u64 idx = 0;
+ int err = 0;
- dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
+ dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
+
+ /* search start position */
+ if (ctx->pos > 2) {
+ u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
+ while (count > 0) {
+ u64 step = count >> 1;
+ dentry = __dcache_find_get_entry(parent, idx + step,
+ &cache_ctl);
+ if (!dentry) {
+ /* use linear search */
+ idx = 0;
+ break;
+ }
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
+ }
+ di = ceph_dentry(dentry);
+ spin_lock(&dentry->d_lock);
+ if (fpos_cmp(di->offset, ctx->pos) < 0) {
+ idx += step + 1;
+ count -= step + 1;
+ } else {
+ count = step;
+ }
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ }
- /* we can calculate cache index for the first dirfrag */
- if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
- cache_ctl.index = fpos_off(ctx->pos) - 2;
- BUG_ON(cache_ctl.index < 0);
- ptr_pos = cache_ctl.index * sizeof(struct dentry *);
+ dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
}
- while (true) {
- pgoff_t pgoff;
- bool emit_dentry;
- if (ptr_pos >= i_size_read(dir)) {
+ for (;;) {
+ bool emit_dentry = false;
+ dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
+ if (!dentry) {
fi->flags |= CEPH_F_ATEND;
err = 0;
break;
}
-
- err = -EAGAIN;
- pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
- if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
- ceph_readdir_cache_release(&cache_ctl);
- cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
- if (!cache_ctl.page) {
- dout(" page %lu not found\n", pgoff);
- break;
- }
- /* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
- unlock_page(cache_ctl.page);
- cache_ctl.dentries = kmap(cache_ctl.page);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
}
- rcu_read_lock();
- spin_lock(&parent->d_lock);
- /* check i_size again here, because empty directory can be
- * marked as complete while not holding the i_mutex. */
- if (ceph_dir_is_complete_ordered(dir) &&
- ptr_pos < i_size_read(dir))
- dentry = cache_ctl.dentries[cache_ctl.index % nsize];
- else
- dentry = NULL;
- spin_unlock(&parent->d_lock);
- if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
- dentry = NULL;
- rcu_read_unlock();
- if (!dentry)
- break;
-
- emit_dentry = false;
di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
if (di->lease_shared_gen == shared_gen &&
d_really_is_positive(dentry) &&
- ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
- ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
fpos_cmp(ctx->pos, di->offset) <= 0) {
emit_dentry = true;
}
spin_unlock(&dentry->d_lock);
if (emit_dentry) {
- dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+ dout(" %llx dentry %p %pd %p\n", di->offset,
dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
@@ -217,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
} else {
dput(dentry);
}
-
- cache_ctl.index++;
- ptr_pos += sizeof(struct dentry *);
}
+out:
ceph_readdir_cache_release(&cache_ctl);
if (last) {
int ret;
@@ -234,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
return err;
}
+static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
+{
+ /* a request is needed unless the buffered reply's frag matches pos */
+ if (fi->last_readdir == NULL)
+ return true;
+ if (!is_hash_order(pos))
+ return fpos_frag(pos) != fi->frag;
+ return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
+}
+
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
struct ceph_file_info *fi = file->private_data;
@@ -241,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(ctx->pos);
- int off = fpos_off(ctx->pos);
+ int i;
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
+ dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
if (fi->flags & CEPH_F_ATEND)
return 0;
@@ -259,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
inode->i_mode >> 12))
return 0;
ctx->pos = 1;
- off = 1;
}
if (ctx->pos == 1) {
ino_t ino = parent_ino(file->f_path.dentry);
@@ -269,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
inode->i_mode >> 12))
return 0;
ctx->pos = 2;
- off = 2;
}
/* can we use the dcache? */
@@ -284,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
- frag = fpos_frag(ctx->pos);
- off = fpos_off(ctx->pos);
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -293,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* proceed with a normal readdir */
more:
/* do we have the correct frag content buffered? */
- if (fi->frag != frag || fi->last_readdir == NULL) {
+ if (need_send_readdir(fi, ctx->pos)) {
struct ceph_mds_request *req;
+ unsigned frag;
int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
@@ -304,6 +372,13 @@ more:
fi->last_readdir = NULL;
}
+ if (is_hash_order(ctx->pos)) {
+ frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+ NULL, NULL);
+ } else {
+ frag = fpos_frag(ctx->pos);
+ }
+
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -330,6 +405,8 @@ more:
req->r_readdir_cache_idx = fi->readdir_cache_idx;
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
+ req->r_args.readdir.flags =
+ cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
req->r_inode = inode;
ihold(inode);
@@ -339,22 +416,26 @@ more:
ceph_mdsc_put_request(req);
return err;
}
- dout("readdir got and parsed readdir result=%d"
- " on frag %x, end=%d, complete=%d\n", err, frag,
+ dout("readdir got and parsed readdir result=%d on "
+ "frag %x, end=%d, complete=%d, hash_order=%d\n",
+ err, frag,
(int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete);
-
+ (int)req->r_reply_info.dir_complete,
+ (int)req->r_reply_info.hash_order);
- /* note next offset and last dentry name */
rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
frag = le32_to_cpu(rinfo->dir_dir->frag);
- off = req->r_readdir_offset;
- fi->next_offset = off;
+ if (!rinfo->hash_order) {
+ fi->next_offset = req->r_readdir_offset;
+ /* adjust ctx->pos to beginning of frag */
+ ctx->pos = ceph_make_fpos(frag,
+ fi->next_offset,
+ false);
+ }
}
fi->frag = frag;
- fi->offset = fi->next_offset;
fi->last_readdir = req;
if (req->r_did_prepopulate) {
@@ -362,7 +443,8 @@ more:
if (fi->readdir_cache_idx < 0) {
/* preclude from marking dir ordered */
fi->dir_ordered_count = 0;
- } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+ } else if (ceph_frag_is_leftmost(frag) &&
+ fi->next_offset == 2) {
/* note dir version at start of readdir so
* we can tell if any dentries get dropped */
fi->dir_release_count = req->r_dir_release_cnt;
@@ -376,65 +458,87 @@ more:
fi->dir_release_count = 0;
}
- if (req->r_reply_info.dir_end) {
- kfree(fi->last_name);
- fi->last_name = NULL;
- if (ceph_frag_is_rightmost(frag))
- fi->next_offset = 2;
- else
- fi->next_offset = 0;
- } else {
- err = note_last_dentry(fi,
- rinfo->dir_dname[rinfo->dir_nr-1],
- rinfo->dir_dname_len[rinfo->dir_nr-1],
- fi->next_offset + rinfo->dir_nr);
+ /* note next offset and last dentry name */
+ if (rinfo->dir_nr > 0) {
+ struct ceph_mds_reply_dir_entry *rde =
+ rinfo->dir_entries + (rinfo->dir_nr-1);
+ unsigned next_offset = req->r_reply_info.dir_end ?
+ 2 : (fpos_off(rde->offset) + 1);
+ err = note_last_dentry(fi, rde->name, rde->name_len,
+ next_offset);
if (err)
return err;
+ } else if (req->r_reply_info.dir_end) {
+ fi->next_offset = 2;
+ /* keep last name */
}
}
rinfo = &fi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
- rinfo->dir_nr, off, fi->offset);
-
- ctx->pos = ceph_make_fpos(frag, off);
- while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
- struct ceph_mds_reply_inode *in =
- rinfo->dir_in[off - fi->offset].in;
+ dout("readdir frag %x num %d pos %llx chunk first %llx\n",
+ fi->frag, rinfo->dir_nr, ctx->pos,
+ rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+
+ i = 0;
+ /* search start position */
+ if (rinfo->dir_nr > 0) {
+ int step, nr = rinfo->dir_nr;
+ while (nr > 0) {
+ step = nr >> 1;
+ if (rinfo->dir_entries[i + step].offset < ctx->pos) {
+ i += step + 1;
+ nr -= step + 1;
+ } else {
+ nr = step;
+ }
+ }
+ }
+ for (; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
ino_t ino;
- dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, ctx->pos,
- rinfo->dir_dname_len[off - fi->offset],
- rinfo->dir_dname[off - fi->offset], in);
- BUG_ON(!in);
- ftype = le32_to_cpu(in->mode) >> 12;
- vino.ino = le64_to_cpu(in->ino);
- vino.snap = le64_to_cpu(in->snapid);
+ BUG_ON(rde->offset < ctx->pos);
+
+ ctx->pos = rde->offset;
+ dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
+ i, rinfo->dir_nr, ctx->pos,
+ rde->name_len, rde->name, &rde->inode.in);
+
+ BUG_ON(!rde->inode.in);
+ ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
ino = ceph_vino_to_ino(vino);
- if (!dir_emit(ctx,
- rinfo->dir_dname[off - fi->offset],
- rinfo->dir_dname_len[off - fi->offset],
- ceph_translate_ino(inode->i_sb, ino), ftype)) {
+
+ if (!dir_emit(ctx, rde->name, rde->name_len,
+ ceph_translate_ino(inode->i_sb, ino), ftype)) {
dout("filldir stopping us...\n");
return 0;
}
- off++;
ctx->pos++;
}
- if (fi->last_name) {
+ if (fi->next_offset > 2) {
ceph_mdsc_put_request(fi->last_readdir);
fi->last_readdir = NULL;
goto more;
}
/* more frags? */
- if (!ceph_frag_is_rightmost(frag)) {
- frag = ceph_frag_next(frag);
- off = 0;
- ctx->pos = ceph_make_fpos(frag, off);
+ if (!ceph_frag_is_rightmost(fi->frag)) {
+ unsigned frag = ceph_frag_next(fi->frag);
+ if (is_hash_order(ctx->pos)) {
+ loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
+ fi->next_offset, true);
+ if (new_pos > ctx->pos)
+ ctx->pos = new_pos;
+ /* keep last_name */
+ } else {
+ ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
+ kfree(fi->last_name);
+ fi->last_name = NULL;
+ }
dout("readdir next frag is %x\n", frag);
goto more;
}
@@ -466,7 +570,7 @@ more:
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
+static void reset_readdir(struct ceph_file_info *fi)
{
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
@@ -476,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
fi->last_name = NULL;
fi->dir_release_count = 0;
fi->readdir_cache_idx = -1;
- if (ceph_frag_is_leftmost(frag))
- fi->next_offset = 2; /* compensate for . and .. */
- else
- fi->next_offset = 0;
+ fi->next_offset = 2; /* compensate for . and .. */
fi->flags &= ~CEPH_F_ATEND;
}
+/*
+ * discard buffered readdir content on seekdir(0), or seek to new frag,
+ * or seek prior to current chunk
+ */
+static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
+{
+ struct ceph_mds_reply_info_parsed *rinfo;
+ loff_t chunk_offset;
+ if (new_pos == 0)
+ return true;
+ if (is_hash_order(new_pos)) {
+ /* no need to reset last_name for a forward seek when
+ * dentries are sorted in hash order */
+ } else if (fi->frag != fpos_frag(new_pos)) { /* compare only: fi->frag must stay untouched */
+ return true;
+ }
+ rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
+ if (!rinfo || !rinfo->dir_nr)
+ return true; /* no buffered entries to seek within */
+ chunk_offset = rinfo->dir_entries[0].offset;
+ return new_pos < chunk_offset ||
+ is_hash_order(new_pos) != is_hash_order(chunk_offset);
+}
+
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
inode_lock(inode);
@@ -504,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
}
if (offset >= 0) {
+ if (need_reset_readdir(fi, offset)) {
+ dout("dir_llseek dropping %p content\n", file);
+ reset_readdir(fi);
+ } else if (is_hash_order(offset) && offset > file->f_pos) {
+ /* for hash offset, we don't know if a forward seek
+ * is within same frag */
+ fi->dir_release_count = 0;
+ fi->readdir_cache_idx = -1;
+ }
+
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
fi->flags &= ~CEPH_F_ATEND;
}
retval = offset;
-
- if (offset == 0 ||
- fpos_frag(offset) != fi->frag ||
- fpos_off(offset) < fi->offset) {
- /* discard buffered readdir content on seekdir(0), or
- * seek to new frag, or seek prior to current chunk */
- dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi, fpos_frag(offset));
- } else if (fpos_cmp(offset, old_offset) > 0) {
- /* reset dir_release_count if we did a forward seek */
- fi->dir_release_count = 0;
- fi->readdir_cache_idx = -1;
- }
}
out:
inode_unlock(inode);
@@ -590,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
return dentry;
}
-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
return ceph_ino(inode) == CEPH_INO_ROOT &&
strncmp(dentry->d_name.name, ".ceph", 5) == 0;
@@ -1003,7 +1124,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
- dentry->d_time = jiffies;
+ ceph_dentry(dentry)->time = jiffies;
ceph_dentry(dentry)->lease_shared_gen = 0;
spin_unlock(&dentry->d_lock);
}
@@ -1012,7 +1133,8 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
-static int dentry_lease_is_valid(struct dentry *dentry)
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
+ struct inode *dir)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *s;
@@ -1020,12 +1142,11 @@ static int dentry_lease_is_valid(struct dentry *dentry)
u32 gen;
unsigned long ttl;
struct ceph_mds_session *session = NULL;
- struct inode *dir = NULL;
u32 seq = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di->lease_session) {
+ if (di && di->lease_session) {
s = di->lease_session;
spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
@@ -1033,17 +1154,24 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
+ time_before(jiffies, di->time) &&
time_before(jiffies, ttl)) {
valid = 1;
if (di->lease_renew_after &&
time_after(jiffies, di->lease_renew_after)) {
- /* we should renew */
- dir = d_inode(dentry->d_parent);
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
}
}
}
@@ -1086,15 +1214,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *dir;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ }
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
- parent = dget_parent(dentry);
- dir = d_inode(parent);
-
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1103,12 +1235,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
} else if (d_really_is_positive(dentry) &&
ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
valid = 1;
- } else if (dentry_lease_is_valid(dentry) ||
- dir_lease_is_valid(dir, dentry)) {
- if (d_really_is_positive(dentry))
- valid = ceph_is_any_caps(d_inode(dentry));
- else
- valid = 1;
+ } else {
+ valid = dentry_lease_is_valid(dentry, flags, dir);
+ if (valid == -ECHILD)
+ return valid;
+ if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (d_really_is_positive(dentry))
+ valid = ceph_is_any_caps(d_inode(dentry));
+ else
+ valid = 1;
+ }
}
if (!valid) {
@@ -1117,6 +1253,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct ceph_mds_request *req;
int op, mask, err;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
op = ceph_snap(dir) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
@@ -1152,7 +1291,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
ceph_dir_clear_complete(dir);
}
- dput(parent);
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
return valid;
}
@@ -1165,10 +1305,14 @@ static void ceph_d_release(struct dentry *dentry)
dout("d_release %p\n", dentry);
ceph_dentry_lru_del(dentry);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+
if (di->lease_session)
ceph_put_mds_session(di->lease_session);
kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
@@ -1342,10 +1486,10 @@ const struct inode_operations ceph_dir_iops = {
.permission = ceph_permission,
.getattr = ceph_getattr,
.setattr = ceph_setattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
+ .removexattr = generic_removexattr,
.get_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
.mknod = ceph_mknod,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 6e72c98162d53..1780218a48f08 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -95,10 +95,8 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
}
dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- iput(inode);
+ if (IS_ERR(dentry))
return dentry;
- }
err = ceph_init_dentry(dentry);
if (err < 0) {
dput(dentry);
@@ -167,10 +165,8 @@ static struct dentry *__get_parent(struct super_block *sb,
return ERR_PTR(-ENOENT);
dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- iput(inode);
+ if (IS_ERR(dentry))
return dentry;
- }
err = ceph_init_dentry(dentry);
if (err < 0) {
dput(dentry);
@@ -210,7 +206,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
dout("fh_to_parent %llx\n", cfh->parent_ino);
dentry = __get_parent(sb, NULL, cfh->ino);
- if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+ if (unlikely(dentry == ERR_PTR(-ENOENT)))
dentry = __fh_to_dentry(sb, cfh->parent_ino);
return dentry;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ef38f01c1795a..0f5375d8e0305 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
struct ceph_file_info *cf;
int ret = 0;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- /* First file open request creates the cookie, we want to keep
- * this cookie around for the filetime of the inode as not to
- * have to worry about fscache register / revoke / operation
- * races.
- *
- * Also, if we know the operation is going to invalidate data
- * (non readonly) just nuke the cache right away.
- */
- ceph_fscache_register_inode_cookie(mdsc->fsc, ci);
- if ((fmode & CEPH_FILE_MODE_WR))
- ceph_fscache_invalidate(inode);
+ ceph_fscache_register_inode_cookie(inode);
+ ceph_fscache_file_set_cookie(inode, file);
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
@@ -192,6 +180,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
}
/*
+ * try renew caps after session gets killed.
+ */
+int ceph_renew_caps(struct inode *inode)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_request *req;
+ int err, flags, wanted;
+
+ spin_lock(&ci->i_ceph_lock);
+ wanted = __ceph_caps_file_wanted(ci);
+ if (__ceph_is_any_real_caps(ci) &&
+ (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { /* no write caps wanted, or an auth cap exists */
+ int issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+ dout("renew caps %p want %s issued %s updating mds_wanted\n",
+ inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+ ceph_check_caps(ci, 0, NULL);
+ return 0;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ /* otherwise send an open request whose flags mirror the wanted caps */
+ flags = 0;
+ if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
+ flags = O_RDWR;
+ else if (wanted & CEPH_CAP_FILE_RD)
+ flags = O_RDONLY;
+ else if (wanted & CEPH_CAP_FILE_WR)
+ flags = O_WRONLY;
+#ifdef O_LAZY
+ if (wanted & CEPH_CAP_FILE_LAZYIO)
+ flags |= O_LAZY;
+#endif
+
+ req = prepare_open_request(inode->i_sb, flags, 0);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+ req->r_fmode = -1;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+out:
+ dout("renew caps %p open result=%d\n", inode, err);
+ return err < 0 ? err : 0; /* positive results are reported as success */
+}
+
+/*
* If we already have the requisite capabilities, we can satisfy
* the open request locally (no need to request new caps from the
* MDS). We do, however, need to inform the MDS (asynchronously)
@@ -353,7 +394,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
- if (d_unhashed(dentry)) {
+ if (d_in_lookup(dentry)) {
dn = ceph_finish_lookup(req, dentry, err);
if (IS_ERR(dn))
err = PTR_ERR(dn);
@@ -466,7 +507,7 @@ more:
ret += zlen;
}
- didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
+ didpages = (page_align + ret) >> PAGE_SHIFT;
pos += ret;
read = pos - off;
left -= ret;
@@ -616,8 +657,7 @@ static void ceph_aio_complete(struct inode *inode,
kfree(aio_req);
}
-static void ceph_aio_complete_req(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
{
int rc = req->r_result;
struct inode *inode = req->r_inode;
@@ -668,7 +708,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req,
}
}
- ceph_put_page_vector(osd_data->pages, num_pages, false);
+ ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write);
ceph_osdc_put_request(req);
if (rc < 0)
@@ -714,14 +754,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
- req->r_base_oloc = orig_req->r_base_oloc;
- req->r_base_oid = orig_req->r_base_oid;
+ ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
+ ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ req = orig_req;
+ goto out;
+ }
req->r_ops[0] = orig_req->r_ops[0];
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
- ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
- snapc, CEPH_NOSNAP, &aio_req->mtime);
+ req->r_mtime = aio_req->mtime;
+ req->r_data_offset = req->r_ops[0].extent.offset;
ceph_osdc_put_request(orig_req);
@@ -733,7 +780,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
out:
if (ret < 0) {
req->r_result = ret;
- ceph_aio_complete_req(req, NULL);
+ ceph_aio_complete_req(req);
}
ceph_put_snap_context(snapc);
@@ -764,6 +811,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
list_add_tail(&req->r_unsafe_item,
&ci->i_unsafe_writes);
spin_unlock(&ci->i_unsafe_lock);
+
+ complete_all(&req->r_completion);
} else {
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_item);
@@ -772,6 +821,54 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
}
+/*
+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+void ceph_sync_write_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_writes;
+ struct ceph_osd_request *req;
+ u64 last_tid;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ /* set upper bound as _last_ entry in chain */
+
+ req = list_last_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_osdc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+
+ dout("sync_write_wait on tid %llu (until %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+
+ spin_lock(&ci->i_unsafe_lock);
+ /*
+ * from here on look at first entry in chain, since we
+ * only want to wait for anything older than last_tid
+ */
+ if (list_empty(head))
+ break;
+ req = list_first_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
@@ -806,8 +903,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write) {
ret = invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + count) >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ (pos + count) >> PAGE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
@@ -872,17 +969,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
* may block.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+len) | (PAGE_CACHE_SIZE - 1));
+ (pos+len) | (PAGE_SIZE - 1));
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+ req->r_mtime = mtime;
}
-
osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
false, false);
- ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
if (aio_req) {
aio_req->total_len += len;
aio_req->num_reqs++;
@@ -917,7 +1012,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
len = ret;
}
- ceph_put_page_vector(pages, num_pages, false);
+ ceph_put_page_vector(pages, num_pages, !write);
ceph_osdc_put_request(req);
if (ret < 0)
@@ -938,6 +1033,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (aio_req) {
+ LIST_HEAD(osd_reqs);
+
if (aio_req->num_reqs == 0) {
kfree(aio_req);
return ret;
@@ -946,8 +1043,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD);
- while (!list_empty(&aio_req->osd_reqs)) {
- req = list_first_entry(&aio_req->osd_reqs,
+ list_splice(&aio_req->osd_reqs, &osd_reqs);
+ while (!list_empty(&osd_reqs)) {
+ req = list_first_entry(&osd_reqs,
struct ceph_osd_request,
r_unsafe_item);
list_del_init(&req->r_unsafe_item);
@@ -956,7 +1054,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req, false);
if (ret < 0) {
req->r_result = ret;
- ceph_aio_complete_req(req, NULL);
+ ceph_aio_complete_req(req);
}
}
return -EIOCBQUEUED;
@@ -1006,8 +1104,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + count) >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ (pos + count) >> PAGE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
@@ -1036,7 +1134,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
* write from beginning of first page,
* regardless of io alignment
*/
- num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
@@ -1067,9 +1165,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
false, true);
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
- ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
+ req->r_mtime = mtime;
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1159,7 +1255,7 @@ again:
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
if (pinned_page) {
- page_cache_release(pinned_page);
+ put_page(pinned_page);
pinned_page = NULL;
}
ceph_put_cap_refs(ci, got);
@@ -1188,10 +1284,10 @@ again:
if (retry_op == READ_INLINE) {
BUG_ON(ret > 0 || read > 0);
if (iocb->ki_pos < i_size &&
- iocb->ki_pos < PAGE_CACHE_SIZE) {
+ iocb->ki_pos < PAGE_SIZE) {
loff_t end = min_t(loff_t, i_size,
iocb->ki_pos + len);
- end = min_t(loff_t, end, PAGE_CACHE_SIZE);
+ end = min_t(loff_t, end, PAGE_SIZE);
if (statret < end)
zero_user_segment(page, statret, end);
ret = copy_page_to_iter(page,
@@ -1292,7 +1388,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
retry_snap:
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
err = -ENOSPC;
goto out;
}
@@ -1350,7 +1446,6 @@ retry_snap:
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
} else {
- loff_t old_size = i_size_read(inode);
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
@@ -1361,8 +1456,6 @@ retry_snap:
written = generic_perform_write(file, from, pos);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
- if (i_size_read(inode) > old_size)
- ceph_fscache_update_objectsize(inode);
inode_unlock(inode);
}
@@ -1382,12 +1475,11 @@ retry_snap:
ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
- if (written >= 0 &&
- ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, pos, pos + written - 1, 1);
- if (err < 0)
- written = err;
+ if (written >= 0) {
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL))
+ iocb->ki_flags |= IOCB_DSYNC;
+
+ written = generic_write_sync(iocb, written);
}
goto out_unlocked;
@@ -1407,16 +1499,14 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t i_size;
- int ret;
+ loff_t ret;
inode_lock(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
- if (ret < 0) {
- offset = ret;
+ if (ret < 0)
goto out;
- }
}
i_size = i_size_read(inode);
@@ -1432,7 +1522,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
* write() or lseek() might have altered it
*/
if (offset == 0) {
- offset = file->f_pos;
+ ret = file->f_pos;
goto out;
}
offset += file->f_pos;
@@ -1452,32 +1542,32 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+ ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
inode_unlock(inode);
- return offset;
+ return ret;
}
static inline void ceph_zero_partial_page(
struct inode *inode, loff_t offset, unsigned size)
{
struct page *page;
- pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t index = offset >> PAGE_SHIFT;
page = find_lock_page(inode->i_mapping, index);
if (page) {
wait_on_page_writeback(page);
- zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+ zero_user(page, offset & (PAGE_SIZE - 1), size);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
loff_t length)
{
- loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+ loff_t nearly = round_up(offset, PAGE_SIZE);
if (offset < nearly) {
loff_t size = nearly - offset;
if (length < size)
@@ -1486,8 +1576,8 @@ static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
offset += size;
length -= size;
}
- if (length >= PAGE_CACHE_SIZE) {
- loff_t size = round_down(length, PAGE_CACHE_SIZE);
+ if (length >= PAGE_SIZE) {
+ loff_t size = round_down(length, PAGE_SIZE);
truncate_pagecache_range(inode, offset, offset + size - 1);
offset += size;
length -= size;
@@ -1525,9 +1615,7 @@ static int ceph_zero_partial_object(struct inode *inode,
goto out;
}
- ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
- &inode->i_mtime);
-
+ req->r_mtime = inode->i_mtime;
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret) {
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1544,9 +1632,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
{
int ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
- s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
- s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+ s32 stripe_unit = ci->i_layout.stripe_unit;
+ s32 stripe_count = ci->i_layout.stripe_count;
+ s32 object_size = ci->i_layout.object_size;
u64 object_set_size = object_size * stripe_count;
u64 nearly, t;
@@ -1618,8 +1706,8 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
- !(mode & FALLOC_FL_PUNCH_HOLE)) {
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
+ !(mode & FALLOC_FL_PUNCH_HOLE)) {
ret = -ENOSPC;
goto unlock;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ed58b168904a9..dd3a6dbf71ebc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -8,8 +8,10 @@
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
+#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
+#include <linux/sort.h>
#include "super.h"
#include "mds_client.h"
@@ -92,10 +94,10 @@ const struct inode_operations ceph_file_iops = {
.permission = ceph_permission,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
+ .removexattr = generic_removexattr,
.get_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
};
@@ -253,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
diri_auth = ci->i_auth_cap->mds;
spin_unlock(&ci->i_ceph_lock);
+ if (mds == -1) /* CDIR_AUTH_PARENT */
+ mds = diri_auth;
+
mutex_lock(&ci->i_fragtree_mutex);
if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */
@@ -299,20 +304,38 @@ out:
return err;
}
+static int frag_tree_split_cmp(const void *l, const void *r)
+{
+ struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
+ struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
+ return ceph_frag_compare(ls->frag, rs->frag);
+}
+
+static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
+{
+ if (!frag)
+ return f == ceph_frag_make(0, 0);
+ if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
+ return false;
+ return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
+}
+
static int ceph_fill_fragtree(struct inode *inode,
struct ceph_frag_tree_head *fragtree,
struct ceph_mds_reply_dirfrag *dirinfo)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_inode_frag *frag;
+ struct ceph_inode_frag *frag, *prev_frag = NULL;
struct rb_node *rb_node;
- int i;
- u32 id, nsplits;
+ unsigned i, split_by, nsplits;
+ u32 id;
bool update = false;
mutex_lock(&ci->i_fragtree_mutex);
nsplits = le32_to_cpu(fragtree->nsplits);
- if (nsplits) {
+ if (nsplits != ci->i_fragtree_nsplits) {
+ update = true;
+ } else if (nsplits) {
i = prandom_u32() % nsplits;
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
@@ -331,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
if (!update)
goto out_unlock;
+ if (nsplits > 1) {
+ sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
+ frag_tree_split_cmp, NULL);
+ }
+
dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
rb_node = rb_first(&ci->i_fragtree);
for (i = 0; i < nsplits; i++) {
id = le32_to_cpu(fragtree->splits[i].frag);
+ split_by = le32_to_cpu(fragtree->splits[i].by);
+ if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
+ pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
+ "frag %x split by %d\n", ceph_vinop(inode),
+ i, nsplits, id, split_by);
+ continue;
+ }
frag = NULL;
while (rb_node) {
frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -346,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
break;
}
rb_node = rb_next(rb_node);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
frag = NULL;
}
if (!frag) {
@@ -355,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
if (IS_ERR(frag))
continue;
}
- frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+ if (frag->split_by == 0)
+ ci->i_fragtree_nsplits++;
+ frag->split_by = split_by;
dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ prev_frag = frag;
}
while (rb_node) {
frag = rb_entry(rb_node, struct ceph_inode_frag, node);
rb_node = rb_next(rb_node);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
}
out_unlock:
mutex_unlock(&ci->i_fragtree_mutex);
@@ -396,7 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
- ci->i_pool_ns_len = 0;
+ RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -418,7 +468,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_prealloc_cap_flush = NULL;
- ci->i_cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
@@ -427,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
- for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
mutex_init(&ci->i_truncate_mutex);
@@ -512,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
rb_erase(n, &ci->i_fragtree);
kfree(frag);
}
+ ci->i_fragtree_nsplits = 0;
__ceph_destroy_xattrs(ci);
if (ci->i_xattrs.blob)
@@ -519,6 +570,8 @@ void ceph_destroy_inode(struct inode *inode)
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+
call_rcu(&inode->i_rcu, ceph_i_callback);
}
@@ -532,6 +585,19 @@ int ceph_drop_inode(struct inode *inode)
return 1;
}
+void ceph_evict_inode(struct inode *inode)
+{
+ /* wait unsafe sync writes */
+ ceph_sync_write_wait(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
+static inline blkcnt_t calc_inode_blocks(u64 size)
+{
+ return (size + (1<<9) - 1) >> 9;
+}
+
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
* careful because either the client or MDS may have more up to date
@@ -554,7 +620,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
size = 0;
}
i_size_write(inode, size);
- inode->i_blocks = (size + (1<<9) - 1) >> 9;
+ inode->i_blocks = calc_inode_blocks(size);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
dout("truncate_seq %u -> %u\n",
@@ -677,6 +743,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_string *pool_ns = NULL;
struct ceph_cap *new_cap = NULL;
int err = 0;
bool wake = false;
@@ -704,6 +771,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
iinfo->xattr_len);
}
+ if (iinfo->pool_ns_len > 0)
+ pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+ iinfo->pool_ns_len);
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -758,10 +829,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
- if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
- ci->i_layout = info->layout;
- ci->i_pool_ns_len = iinfo->pool_ns_len;
+
+ pool_ns = old_ns;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
@@ -813,9 +892,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
- err = -EINVAL;
- if (WARN_ON(symlen != i_size_read(inode)))
- goto out;
+ if (symlen != i_size_read(inode)) {
+ pr_err("fill_inode %llx.%llx BAD symlink "
+ "size %lld\n", ceph_vinop(inode),
+ i_size_read(inode));
+ i_size_write(inode, symlen);
+ inode->i_blocks = calc_inode_blocks(symlen);
+ }
err = -ENOMEM;
sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -925,6 +1008,7 @@ out:
ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
+ ceph_put_string(pool_ns);
return err;
}
@@ -958,7 +1042,7 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
if (di->lease_gen == session->s_cap_gen &&
- time_before(ttl, dentry->d_time))
+ time_before(ttl, di->time))
goto out_unlock; /* we already have a newer lease. */
if (di->lease_session && di->lease_session != session)
@@ -972,7 +1056,7 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
- dentry->d_time = ttl;
+ di->time = ttl;
out_unlock:
spin_unlock(&dentry->d_lock);
return;
@@ -1104,7 +1188,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dname.name = rinfo->dname;
dname.len = rinfo->dname_len;
- dname.hash = full_name_hash(dname.name, dname.len);
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
retry_lookup:
@@ -1308,12 +1392,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
int i, err = 0;
for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
struct inode *in;
int rc;
- vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
- vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
in = ceph_get_inode(req->r_dentry->d_sb, vino);
if (IS_ERR(in)) {
@@ -1321,14 +1406,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+ rc = fill_inode(in, NULL, &rde->inode, NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation);
if (rc < 0) {
pr_err("fill_inode badness on %p got %d\n", in, rc);
err = rc;
- continue;
}
+ iput(in);
}
return err;
@@ -1338,7 +1423,7 @@ void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
if (ctl->page) {
kunmap(ctl->page);
- page_cache_release(ctl->page);
+ put_page(ctl->page);
ctl->page = NULL;
}
}
@@ -1348,7 +1433,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_mds_request *req)
{
struct ceph_inode_info *ci = ceph_inode(dir);
- unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
+ unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
@@ -1367,7 +1452,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
- memset(ctl->dentries, 0, PAGE_CACHE_SIZE);
+ memset(ctl->dentries, 0, PAGE_SIZE);
}
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
@@ -1386,6 +1471,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct dentry *parent = req->r_dentry;
+ struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct qstr dname;
struct dentry *dn;
@@ -1393,22 +1479,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
int err = 0, skipped = 0, ret, i;
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- struct ceph_dentry_info *di;
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+ u32 last_hash = 0;
+ u32 fpos_offset;
struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
+ if (rinfo->hash_order && req->r_path2) {
+ last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ req->r_path2, strlen(req->r_path2));
+ last_hash = ceph_frag_value(last_hash);
+ }
+
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
dout("readdir_prepopulate got new frag %x -> %x\n",
frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
- if (ceph_frag_is_leftmost(frag))
+ if (!rinfo->hash_order)
req->r_readdir_offset = 2;
- else
- req->r_readdir_offset = 0;
}
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1426,24 +1517,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
- struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0;
}
cache_ctl.index = req->r_readdir_cache_idx;
+ fpos_offset = req->r_readdir_offset;
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
- dname.name = rinfo->dir_dname[i];
- dname.len = rinfo->dir_dname_len[i];
- dname.hash = full_name_hash(dname.name, dname.len);
-
- vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
- vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+ dname.name = rde->name;
+ dname.len = rde->name_len;
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
+
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
+
+ if (rinfo->hash_order) {
+ u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ rde->name, rde->name_len);
+ hash = ceph_frag_value(hash);
+ if (hash != last_hash)
+ fpos_offset = 2;
+ last_hash = hash;
+ rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
+ } else {
+ rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
+ }
retry_lookup:
dn = d_lookup(parent, &dname);
@@ -1489,7 +1593,7 @@ retry_lookup:
}
}
- ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+ ret = fill_inode(in, NULL, &rde->inode, NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation);
if (ret < 0) {
@@ -1522,11 +1626,9 @@ retry_lookup:
dn = realdn;
}
- di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+ ceph_dentry(dn)->offset = rde->offset;
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session,
+ update_dentry_lease(dn, rde->lease, req->r_session,
req->r_request_started);
if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1561,7 +1663,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
i_size_write(inode, size);
- inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+ inode->i_blocks = calc_inode_blocks(size);
/* tell the MDS if we are approaching max_size */
if ((size << 1) >= ci->i_max_size &&
@@ -1623,10 +1725,21 @@ static void ceph_invalidate_work(struct work_struct *work)
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_pg_inv_work);
struct inode *inode = &ci->vfs_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
u32 orig_gen;
int check = 0;
mutex_lock(&ci->i_truncate_mutex);
+
+ if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
+ inode, ceph_ino(inode));
+ mapping_set_error(inode->i_mapping, -EIO);
+ truncate_pagecache(inode, 0);
+ mutex_unlock(&ci->i_truncate_mutex);
+ goto out;
+ }
+
spin_lock(&ci->i_ceph_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1640,7 +1753,9 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- truncate_pagecache(inode, 0);
+ if (invalidate_inode_pages2(inode->i_mapping) < 0) {
+ pr_err("invalidate_pages %p fails\n", inode);
+ }
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
@@ -1770,22 +1885,18 @@ static const struct inode_operations ceph_symlink_iops = {
.get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
+ .removexattr = generic_removexattr,
};
-/*
- * setattr
- */
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+int __ceph_setattr(struct inode *inode, struct iattr *attr)
{
- struct inode *inode = d_inode(dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf;
int issued;
int release = 0, dirtied = 0;
@@ -1923,8 +2034,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size);
- inode->i_blocks =
- (attr->ia_size + (1 << 9) - 1) >> 9;
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
inode->i_ctime = attr->ia_ctime;
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
@@ -2010,6 +2120,14 @@ out_put:
}
/*
+ * setattr
+ */
+int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ return __ceph_setattr(d_inode(dentry), attr);
+}
+
+/*
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
*/
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d70158e..7d752d53353a2 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (!err) {
- l.stripe_unit = ceph_file_layout_su(ci->i_layout);
- l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- l.object_size = ceph_file_layout_object_size(ci->i_layout);
- l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ l.stripe_unit = ci->i_layout.stripe_unit;
+ l.stripe_count = ci->i_layout.stripe_count;
+ l.object_size = ci->i_layout.object_size;
+ l.data_pool = ci->i_layout.pool_id;
l.preferred_osd = (s32)-1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
@@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
if (l.stripe_count)
nl.stripe_count = l.stripe_count;
else
- nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ nl.stripe_count = ci->i_layout.stripe_count;
if (l.stripe_unit)
nl.stripe_unit = l.stripe_unit;
else
- nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ nl.stripe_unit = ci->i_layout.stripe_unit;
if (l.object_size)
nl.object_size = l.object_size;
else
- nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ nl.object_size = ci->i_layout.object_size;
if (l.data_pool)
nl.data_pool = l.data_pool;
else
- nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+ nl.data_pool = ci->i_layout.pool_id;
/* this is obsolete, and always -1 */
nl.preferred_osd = le64_to_cpu(-1);
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
- struct ceph_object_id oid;
+ CEPH_DEFINE_OID_ONSTACK(oid);
u64 len = 1, olen;
u64 tmp;
struct ceph_pg pgid;
@@ -193,17 +193,17 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
if (copy_from_user(&dl, arg, sizeof(dl)))
return -EFAULT;
- down_read(&osdc->map_sem);
+ down_read(&osdc->lock);
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
if (r < 0) {
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
return -EIO;
}
dl.file_offset -= dl.object_offset;
- dl.object_size = ceph_file_layout_object_size(ci->i_layout);
- dl.block_size = ceph_file_layout_su(ci->i_layout);
+ dl.object_size = ci->i_layout.object_size;
+ dl.block_size = ci->i_layout.stripe_unit;
/* block_offset = object_offset % block_size */
tmp = dl.object_offset;
@@ -212,16 +212,19 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
- ceph_oid_set_name(&oid, dl.object_name);
+ oloc.pool = ci->i_layout.pool_id;
+ oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ceph_oid_printf(&oid, "%s", dl.object_name);
- r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
+ r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
+
+ ceph_oloc_destroy(&oloc);
if (r < 0) {
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
return r;
}
- dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+ dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +233,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
} else {
memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
}
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
/* send result back to user */
if (copy_to_user(arg, &dl, sizeof(dl)))
@@ -247,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file)
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
- ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
- ci->i_nr_by_mode[fi->fmode]++;
+ ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 44852c3ae5311..fa59a85226b26 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -48,7 +48,7 @@
struct ceph_reconnect_state {
int nr_caps;
struct ceph_pagelist *pagelist;
- bool flock;
+ unsigned msg_version;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
@@ -100,12 +100,15 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- *p += info->pool_ns_len;
- } else {
- info->pool_ns_len = 0;
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
}
return 0;
@@ -181,17 +184,18 @@ static int parse_reply_info_dir(void **p, void *end,
ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p);
- info->dir_end = ceph_decode_8(p);
- info->dir_complete = ceph_decode_8(p);
+ {
+ u16 flags = ceph_decode_16(p);
+ info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
+ info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+ info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+ }
if (num == 0)
goto done;
- BUG_ON(!info->dir_in);
- info->dir_dname = (void *)(info->dir_in + num);
- info->dir_dname_len = (void *)(info->dir_dname + num);
- info->dir_dlease = (void *)(info->dir_dname_len + num);
- if ((unsigned long)(info->dir_dlease + num) >
- (unsigned long)info->dir_in + info->dir_buf_size) {
+ BUG_ON(!info->dir_entries);
+ if ((unsigned long)(info->dir_entries + num) >
+ (unsigned long)info->dir_entries + info->dir_buf_size) {
pr_err("dir contents are larger than expected\n");
WARN_ON(1);
goto bad;
@@ -199,21 +203,23 @@ static int parse_reply_info_dir(void **p, void *end,
info->dir_nr = num;
while (num) {
+ struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
/* dentry */
ceph_decode_need(p, end, sizeof(u32)*2, bad);
- info->dir_dname_len[i] = ceph_decode_32(p);
- ceph_decode_need(p, end, info->dir_dname_len[i], bad);
- info->dir_dname[i] = *p;
- *p += info->dir_dname_len[i];
- dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
- info->dir_dname[i]);
- info->dir_dlease[i] = *p;
+ rde->name_len = ceph_decode_32(p);
+ ceph_decode_need(p, end, rde->name_len, bad);
+ rde->name = *p;
+ *p += rde->name_len;
+ dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
+ rde->lease = *p;
*p += sizeof(struct ceph_mds_reply_lease);
/* inode */
- err = parse_reply_info_in(p, end, &info->dir_in[i], features);
+ err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
goto out_bad;
+ /* ceph_readdir_prepopulate() will update it */
+ rde->offset = 0;
i++;
num--;
}
@@ -345,9 +351,9 @@ out_bad:
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
- if (!info->dir_in)
+ if (!info->dir_entries)
return;
- free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
+ free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
}
@@ -386,9 +392,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
- ceph_auth_destroy_authorizer(
- s->s_mdsc->fsc->client->monc.auth,
- s->s_auth.authorizer);
+ ceph_auth_destroy_authorizer(s->s_auth.authorizer);
kfree(s);
}
}
@@ -468,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_flushing);
- INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
dout("register_session mds%d\n", mds);
if (mds >= mdsc->max_sessions) {
@@ -569,51 +572,23 @@ void ceph_mdsc_release_request(struct kref *kref)
kfree(req);
}
+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
+
/*
* lookup session, bump ref if found.
*
* called under mdsc->mutex.
*/
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
- u64 tid)
+static struct ceph_mds_request *
+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
{
struct ceph_mds_request *req;
- struct rb_node *n = mdsc->request_tree.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_mds_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else {
- ceph_mdsc_get_request(req);
- return req;
- }
- }
- return NULL;
-}
-
-static void __insert_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *new)
-{
- struct rb_node **p = &mdsc->request_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_mds_request *req = NULL;
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_mds_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
+ req = lookup_request(&mdsc->request_tree, tid);
+ if (req)
+ ceph_mdsc_get_request(req);
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &mdsc->request_tree);
+ return req;
}
/*
@@ -632,7 +607,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
req->r_num_caps);
dout("__register_request %p tid %lld\n", req, req->r_tid);
ceph_mdsc_get_request(req);
- __insert_request(mdsc, req);
+ insert_request(&mdsc->request_tree, req);
req->r_uid = current_fsuid();
req->r_gid = current_fsgid();
@@ -665,8 +640,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
}
}
- rb_erase(&req->r_node, &mdsc->request_tree);
- RB_CLEAR_NODE(&req->r_node);
+ erase_request(&mdsc->request_tree, req);
if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -870,12 +844,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_bytes = 0;
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
+ struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
void *p;
const char* metadata[][2] = {
{"hostname", utsname()->nodename},
{"kernel_version", utsname()->release},
- {"entity_id", opt->name ? opt->name : ""},
+ {"entity_id", opt->name ? : ""},
+ {"root", fsopt->server_path ? : "/"},
{NULL, NULL}
};
@@ -1151,9 +1127,11 @@ out:
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
struct ceph_inode_info *ci = ceph_inode(inode);
LIST_HEAD(to_remove);
- int drop = 0;
+ bool drop = false;
+ bool invalidate = false;
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
@@ -1161,22 +1139,25 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
- while (true) {
- struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
- if (!n)
- break;
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add(&cf->list, &to_remove);
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+
+ if (ci->i_wrbuffer_ref > 0 &&
+ ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ invalidate = true;
+
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
+ list_add(&cf->i_list, &to_remove);
}
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+ list_for_each_entry(cf, &to_remove, i_list)
+ list_del(&cf->g_list);
if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited(
@@ -1185,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
inode, ceph_ino(inode));
ci->i_dirty_caps = 0;
list_del_init(&ci->i_dirty_item);
- drop = 1;
+ drop = true;
}
if (!list_empty(&ci->i_flushing_item)) {
pr_warn_ratelimited(
@@ -1195,12 +1176,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_flushing_caps = 0;
list_del_init(&ci->i_flushing_item);
mdsc->num_cap_flushing--;
- drop = 1;
+ drop = true;
}
spin_unlock(&mdsc->cap_dirty_lock);
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
- list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+ list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
}
}
@@ -1208,11 +1189,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
while (!list_empty(&to_remove)) {
struct ceph_cap_flush *cf;
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
- while (drop--)
+
+ wake_up_all(&ci->i_cap_wq);
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ if (drop)
iput(inode);
return 0;
}
@@ -1222,12 +1207,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
*/
static void remove_session_caps(struct ceph_mds_session *session)
{
+ struct ceph_fs_client *fsc = session->s_mdsc->fsc;
+ struct super_block *sb = fsc->sb;
dout("remove_session_caps on %p\n", session);
- iterate_session_caps(session, remove_session_caps_cb, NULL);
+ iterate_session_caps(session, remove_session_caps_cb, fsc);
+
+ wake_up_all(&fsc->mdsc->cap_flushing_wq);
spin_lock(&session->s_cap_lock);
if (session->s_nr_caps > 0) {
- struct super_block *sb = session->s_mdsc->fsc->sb;
struct inode *inode;
struct ceph_cap *cap, *prev = NULL;
struct ceph_vino vino;
@@ -1272,13 +1260,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = ceph_inode(inode);
- wake_up_all(&ci->i_cap_wq);
if (arg) {
spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
}
+ wake_up_all(&ci->i_cap_wq);
return 0;
}
@@ -1492,35 +1480,21 @@ static int trim_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static int check_capsnap_flush(struct ceph_inode_info *ci,
- u64 want_snap_seq)
-{
- int ret = 1;
- spin_lock(&ci->i_ceph_lock);
- if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap, ci_item);
- ret = capsnap->follows >= want_snap_seq;
- }
- spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
- struct rb_node *n;
- struct ceph_cap_flush *cf;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (cf && cf->tid <= want_flush_tid) {
- dout("check_caps_flush still flushing tid %llu <= %llu\n",
- cf->tid, want_flush_tid);
- ret = 0;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ if (cf->tid <= want_flush_tid) {
+ dout("check_caps_flush still flushing tid "
+ "%llu <= %llu\n", cf->tid, want_flush_tid);
+ ret = 0;
+ }
}
spin_unlock(&mdsc->cap_dirty_lock);
return ret;
@@ -1532,54 +1506,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
* returns true if we've flushed through want_flush_tid
*/
static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_tid, u64 want_snap_seq)
+ u64 want_flush_tid)
{
- int mds;
-
- dout("check_caps_flush want %llu snap want %llu\n",
- want_flush_tid, want_snap_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; ) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
- struct inode *inode = NULL;
-
- if (!session) {
- mds++;
- continue;
- }
- get_session(session);
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_snaps_flushing)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&session->s_cap_snaps_flushing,
- struct ceph_cap_snap,
- flushing_item);
- struct ceph_inode_info *ci = capsnap->ci;
- if (!check_capsnap_flush(ci, want_snap_seq)) {
- dout("check_cap_flush still flushing snap %p "
- "follows %lld <= %lld to mds%d\n",
- &ci->vfs_inode, capsnap->follows,
- want_snap_seq, mds);
- inode = igrab(&ci->vfs_inode);
- }
- }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
-
- if (inode) {
- wait_event(mdsc->cap_flushing_wq,
- check_capsnap_flush(ceph_inode(inode),
- want_snap_seq));
- iput(inode);
- } else {
- mds++;
- }
-
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ dout("check_caps_flush want %llu\n", want_flush_tid);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
@@ -1610,7 +1539,7 @@ again:
while (!list_empty(&tmp_list)) {
if (!msg) {
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
- PAGE_CACHE_SIZE, GFP_NOFS, false);
+ PAGE_SIZE, GFP_NOFS, false);
if (!msg)
goto out_err;
head = msg->front.iov_base;
@@ -1673,8 +1602,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
- size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
- sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+ size_t size = sizeof(struct ceph_mds_reply_dir_entry);
int order, num_entries;
spin_lock(&ci->i_ceph_lock);
@@ -1685,14 +1613,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
- rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
- __GFP_NOWARN,
- order);
- if (rinfo->dir_in)
+ rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
+ __GFP_NOWARN,
+ order);
+ if (rinfo->dir_entries)
break;
order--;
}
- if (!rinfo->dir_in)
+ if (!rinfo->dir_entries)
return -ENOMEM;
num_entries = (PAGE_SIZE << order) / size;
@@ -1724,6 +1652,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
+ RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait);
init_completion(&req->r_completion);
init_completion(&req->r_safe_completion);
@@ -2177,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (mdsc->mdsmap_err) {
+ err = mdsc->mdsmap_err;
+ dout("do_request mdsmap err %d\n", err);
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
goto out;
@@ -2306,14 +2240,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- /* deny access to directories with pool_ns layouts */
- if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
- ceph_inode(req->r_inode)->i_pool_ns_len)
- return -EIO;
- if (req->r_locked_dir &&
- ceph_inode(req->r_locked_dir)->i_pool_ns_len)
- return -EIO;
-
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2416,7 +2342,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* get request, session */
tid = le64_to_cpu(msg->hdr.tid);
mutex_lock(&mdsc->mutex);
- req = __lookup_request(mdsc, tid);
+ req = lookup_get_request(mdsc, tid);
if (!req) {
dout("handle_reply on unknown tid %llu\n", tid);
mutex_unlock(&mdsc->mutex);
@@ -2606,7 +2532,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
fwd_seq = ceph_decode_32(&p);
mutex_lock(&mdsc->mutex);
- req = __lookup_request(mdsc, tid);
+ req = lookup_get_request(mdsc, tid);
if (!req) {
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
goto out; /* dup reply? */
@@ -2805,13 +2731,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
} rec;
- size_t reclen;
struct ceph_inode_info *ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
char *path;
int pathlen, err;
u64 pathbase;
+ u64 snap_follows;
struct dentry *dentry;
ci = cap->ci;
@@ -2834,9 +2760,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
path = NULL;
pathlen = 0;
}
- err = ceph_pagelist_encode_string(pagelist, path, pathlen);
- if (err)
- goto out_free;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -2844,14 +2767,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = 0;
- reclen = sizeof(rec.v2);
} else {
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -2861,13 +2783,23 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = cpu_to_le64(pathbase);
- reclen = sizeof(rec.v1);
+ }
+
+ if (list_empty(&ci->i_cap_snaps)) {
+ snap_follows = 0;
+ } else {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ snap_follows = capsnap->follows;
}
spin_unlock(&ci->i_ceph_lock);
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks;
+ size_t struct_len, total_len = 0;
+ u8 struct_v = 0;
encode_again:
ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
@@ -2886,20 +2818,51 @@ encode_again:
goto encode_again;
goto out_free;
}
+
+ if (recon_state->msg_version >= 3) {
+ /* version, compat_version and struct_len */
+ total_len = 2 * sizeof(u8) + sizeof(u32);
+ struct_v = 2;
+ }
/*
* number of encoded locks is stable, so copy to pagelist
*/
- rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_locks_to_pagelist(flocks, pagelist,
- num_fcntl_locks,
- num_flock_locks);
+ struct_len = 2 * sizeof(u32) +
+ (num_fcntl_locks + num_flock_locks) *
+ sizeof(struct ceph_filelock);
+ rec.v2.flock_len = cpu_to_le32(struct_len);
+
+ struct_len += sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen;
+
+ if (struct_v >= 2)
+ struct_len += sizeof(u64); /* snap_follows */
+
+ total_len += struct_len;
+ err = ceph_pagelist_reserve(pagelist, total_len);
+
+ if (!err) {
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
+ }
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+ }
kfree(flocks);
} else {
- err = ceph_pagelist_append(pagelist, &rec, reclen);
+ size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
+ err = ceph_pagelist_reserve(pagelist, size);
+ if (!err) {
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ }
}
recon_state->nr_caps++;
@@ -2990,7 +2953,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.nr_caps = 0;
recon_state.pagelist = pagelist;
- recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+ if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ recon_state.msg_version = 3;
+ else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
+ recon_state.msg_version = 2;
+ else
+ recon_state.msg_version = 1;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
@@ -3019,8 +2987,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail;
}
- if (recon_state.flock)
- reply->hdr.version = cpu_to_le16(2);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
/* raced with cap release? */
if (s_nr_caps != recon_state.nr_caps) {
@@ -3218,7 +3185,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
WARN_ON(1);
goto release; /* hrm... */
}
- dname.hash = full_name_hash(dname.name, dname.len);
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
dentry = d_lookup(parent, &dname);
dput(parent);
if (!dentry)
@@ -3245,7 +3212,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
msecs_to_jiffies(le32_to_cpu(h->duration_ms));
di->lease_seq = seq;
- dentry->d_time = di->lease_renew_from + duration;
+ di->time = di->lease_renew_from + duration;
di->lease_renew_after = di->lease_renew_from +
(duration >> 1);
di->lease_renew_from = 0;
@@ -3311,47 +3278,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
}
/*
- * Preemptively release a lease we expect to invalidate anyway.
- * Pass @inode always, @dentry is optional.
- */
-void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
- struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *session;
- u32 seq;
-
- BUG_ON(inode == NULL);
- BUG_ON(dentry == NULL);
-
- /* is dentry lease valid? */
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (!di || !di->lease_session ||
- di->lease_session->s_mds < 0 ||
- di->lease_gen != di->lease_session->s_cap_gen ||
- !time_before(jiffies, dentry->d_time)) {
- dout("lease_release inode %p dentry %p -- "
- "no lease\n",
- inode, dentry);
- spin_unlock(&dentry->d_lock);
- return;
- }
-
- /* we do have a lease on this dentry; note mds and seq */
- session = ceph_get_mds_session(di->lease_session);
- seq = di->lease_seq;
- __ceph_mdsc_drop_dentry_lease(dentry);
- spin_unlock(&dentry->d_lock);
-
- dout("lease_release inode %p dentry %p to mds%d\n",
- inode, dentry, session->s_mds);
- ceph_mdsc_lease_send_msg(session, inode, dentry,
- CEPH_MDS_LEASE_RELEASE, seq);
- ceph_put_mds_session(session);
-}
-
-/*
* drop all leases (and dentry refs) in preparation for umount
*/
static void drop_leases(struct ceph_mds_client *mdsc)
@@ -3484,7 +3410,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
- mdsc->cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->cap_flush_list);
INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
@@ -3599,7 +3525,7 @@ restart:
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
- u64 want_tid, want_flush, want_snap;
+ u64 want_tid, want_flush;
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
@@ -3612,17 +3538,19 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
ceph_flush_dirty_caps(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
want_flush = mdsc->last_cap_flush_tid;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ cf->wake = true;
+ }
spin_unlock(&mdsc->cap_dirty_lock);
- down_read(&mdsc->snap_rwsem);
- want_snap = mdsc->last_snap_seq;
- up_read(&mdsc->snap_rwsem);
-
- dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
- want_tid, want_flush, want_snap);
+ dout("sync want tid %lld flush_seq %lld\n",
+ want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush, want_snap);
+ wait_caps_flush(mdsc, want_flush);
}
/*
@@ -3743,11 +3671,86 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
dout("mdsc_destroy %p done\n", mdsc);
}
+/*
+ * Handle a CEPH_MSG_FS_MAP_USER message.
+ *
+ * Walks the encoded list of filesystems looking for the one whose name
+ * equals the mds_namespace mount option.  On a match the filesystem id
+ * is stored in monc.fs_cluster_id and an MDSMAP subscription is
+ * requested.
+ *
+ * On failure the error is recorded in mdsc->mdsmap_err and every request
+ * parked on waiting_for_map is woken, so __do_request() can fail it
+ * instead of waiting forever:
+ *   -EINVAL  the message could not be decoded
+ *   -ENOENT  no filesystem in the map matches mds_namespace
+ */
+void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+	struct ceph_fs_client *fsc = mdsc->fsc;
+	const char *mds_namespace = fsc->mount_options->mds_namespace;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+	u32 epoch;
+	u32 map_len;
+	u32 num_fs;
+	u32 mount_fscid = (u32)-1;	/* (u32)-1 == "not found" sentinel */
+	u8 struct_v, struct_cv;
+	int err = -EINVAL;		/* default for every 'bad' decode exit */
+
+	ceph_decode_need(&p, end, sizeof(u32), bad);
+	epoch = ceph_decode_32(&p);
+
+	dout("handle_fsmap epoch %u\n", epoch);
+
+	/* versioned-struct header: decoded only to advance the cursor */
+	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+	struct_v = ceph_decode_8(&p);
+	struct_cv = ceph_decode_8(&p);
+	map_len = ceph_decode_32(&p);
+
+	ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
+	p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
+
+	num_fs = ceph_decode_32(&p);
+	while (num_fs-- > 0) {
+		void *info_p, *info_end;
+		u32 info_len;
+		u8 info_v, info_cv;
+		u32 fscid, namelen;
+
+		ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+		info_v = ceph_decode_8(&p);
+		info_cv = ceph_decode_8(&p);
+		info_len = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, info_len, bad);
+		/*
+		 * Bound parsing of this entry to its own encoded length and
+		 * move the outer cursor past it up front, so fields we do
+		 * not understand are skipped.
+		 */
+		info_p = p;
+		info_end = p + info_len;
+		p = info_end;
+
+		ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
+		fscid = ceph_decode_32(&info_p);
+		namelen = ceph_decode_32(&info_p);
+		ceph_decode_need(&info_p, info_end, namelen, bad);
+
+		/* the encoded name is length-prefixed, not NUL-terminated */
+		if (mds_namespace &&
+		    strlen(mds_namespace) == namelen &&
+		    !strncmp(mds_namespace, (char *)info_p, namelen)) {
+			mount_fscid = fscid;
+			break;
+		}
+	}
+
+	ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
+	if (mount_fscid == (u32)-1) {
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	fsc->client->monc.fs_cluster_id = mount_fscid;
+	ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+			   0, true);
+	ceph_monc_renew_subs(&fsc->client->monc);
+	return;
+
+bad:
+	pr_err("error decoding fsmap\n");
+err_out:
+	mutex_lock(&mdsc->mutex);
+	/*
+	 * Record the real cause.  This used to be a hard-coded -ENOENT,
+	 * which left 'err' unused and reported a corrupt fsmap as
+	 * "no such filesystem".
+	 */
+	mdsc->mdsmap_err = err;
+	__wake_requests(mdsc, &mdsc->waiting_for_map);
+	mutex_unlock(&mdsc->mutex);
+}
/*
* handle mds map update.
*/
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
u32 epoch;
u32 maplen;
@@ -3854,7 +3857,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(mdsc, msg);
+ ceph_mdsc_handle_mdsmap(mdsc, msg);
+ break;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(mdsc, msg);
break;
case CEPH_MSG_CLIENT_SESSION:
handle_session(s, msg);
@@ -3900,7 +3906,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
struct ceph_auth_handshake *auth = &s->s_auth;
if (force_new && auth->authorizer) {
- ceph_auth_destroy_authorizer(ac, auth->authorizer);
+ ceph_auth_destroy_authorizer(auth->authorizer);
auth->authorizer = NULL;
}
if (!auth->authorizer) {
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 37712ccffcc6b..6b3679737d4a6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -45,6 +45,15 @@ struct ceph_mds_reply_info_in {
u32 inline_len;
char *inline_data;
u32 pool_ns_len;
+ char *pool_ns_data;
+};
+
+struct ceph_mds_reply_dir_entry { /* one parsed readdir entry; filled in by parse_reply_info_dir() */
+ char *name; /* set from the decode cursor, so it points into the reply buffer being parsed */
+ u32 name_len; /* length of name in bytes, as decoded from the reply */
+ struct ceph_mds_reply_lease *lease; /* also points into the reply buffer, right after the name */
+ struct ceph_mds_reply_info_in inode; /* filled in by parse_reply_info_in() */
+ loff_t offset; /* parser sets this to 0; per the parser's comment, ceph_readdir_prepopulate() updates it */
};
/*
@@ -73,11 +82,10 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_dirfrag *dir_dir;
size_t dir_buf_size;
int dir_nr;
- char **dir_dname;
- u32 *dir_dname_len;
- struct ceph_mds_reply_lease **dir_dlease;
- struct ceph_mds_reply_info_in *dir_in;
- u8 dir_complete, dir_end;
+ bool dir_complete;
+ bool dir_end;
+ bool hash_order;
+ struct ceph_mds_reply_dir_entry *dir_entries;
};
/* for create results */
@@ -97,7 +105,7 @@ struct ceph_mds_reply_info_parsed {
/*
* cap releases are batched and sent to the MDS en masse.
*/
-#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \
sizeof(struct ceph_mds_cap_release)) / \
sizeof(struct ceph_mds_cap_item))
@@ -144,7 +152,6 @@ struct ceph_mds_session {
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
- struct list_head s_cap_snaps_flushing;
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
@@ -268,8 +275,10 @@ struct ceph_mds_request {
struct ceph_pool_perm {
struct rb_node node;
- u32 pool;
int perm;
+ s64 pool;
+ size_t pool_ns_len;
+ char pool_ns[];
};
/*
@@ -283,6 +292,7 @@ struct ceph_mds_client {
struct completion safe_umount_waiters;
wait_queue_head_t session_close_wq;
struct list_head waiting_for_map;
+ int mdsmap_err;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
@@ -314,7 +324,7 @@ struct ceph_mds_client {
spinlock_t snap_flush_lock;
u64 last_cap_flush_tid;
- struct rb_root cap_flush_tree;
+ struct list_head cap_flush_list;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
@@ -375,10 +385,6 @@ extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn);
-
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct inode *dir);
@@ -413,8 +419,10 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct dentry *dentry, char action,
u32 seq);
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
+extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
extern struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 261531e55e9d0..8c3591a7fbaee 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
const void *start = *p;
int i, j, n;
int err = -EINVAL;
- u16 version;
+ u8 mdsmap_v, mdsmap_cv;
m = kzalloc(sizeof(*m), GFP_NOFS);
if (m == NULL)
return ERR_PTR(-ENOMEM);
- ceph_decode_16_safe(p, end, version, bad);
- if (version > 3) {
- pr_warn("got mdsmap version %d > 3, failing", version);
- goto bad;
+ ceph_decode_need(p, end, 1 + 1, bad);
+ mdsmap_v = ceph_decode_8(p);
+ mdsmap_cv = ceph_decode_8(p);
+ if (mdsmap_v >= 4) {
+ u32 mdsmap_len;
+ ceph_decode_32_safe(p, end, mdsmap_len, bad);
+ if (end < *p + mdsmap_len)
+ goto bad;
+ end = *p + mdsmap_len;
}
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 namelen;
s32 mds, inc, state;
u64 state_seq;
- u8 infoversion;
+ u8 info_v;
+ void *info_end = NULL;
struct ceph_entity_addr addr;
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
struct ceph_mds_info *info;
- ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+ ceph_decode_need(p, end, sizeof(u64) + 1, bad);
global_id = ceph_decode_64(p);
- infoversion = ceph_decode_8(p);
+ info_v= ceph_decode_8(p);
+ if (info_v >= 4) {
+ u32 info_len;
+ u8 info_cv;
+ ceph_decode_need(p, end, 1 + sizeof(u32), bad);
+ info_cv = ceph_decode_8(p);
+ info_len = ceph_decode_32(p);
+ info_end = *p + info_len;
+ if (info_end > end)
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
*p += sizeof(u64);
namelen = ceph_decode_32(p); /* skip mds name */
*p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
*p += namelen;
- if (infoversion >= 2) {
+ if (info_v >= 2) {
ceph_decode_32_safe(p, end, num_export_targets, bad);
pexport_targets = *p;
*p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
num_export_targets = 0;
}
+ if (info_end && *p != info_end) {
+ if (*p > info_end)
+ goto bad;
+ *p = info_end;
+ }
+
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */
+ *p = end;
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
return m;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 9caaa7ffc93fe..9ff5219d849e9 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ihold(inode);
atomic_set(&capsnap->nref, 1);
- capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
- INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
@@ -551,7 +549,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ci->i_wrbuffer_ref_head = 0;
capsnap->context = old_snapc;
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
- old_snapc = NULL;
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
@@ -563,6 +560,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
__ceph_finish_cap_snap(ci, capsnap);
}
capsnap = NULL;
+ old_snapc = NULL;
update_snapc:
if (ci->i_head_snapc) {
@@ -603,6 +601,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->dirty_pages);
return 0;
}
+
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
inode, capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
@@ -799,9 +799,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
inode = &ci->vfs_inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_flush_snaps(ci, &session);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c973043deb0ec..e247f6f0feb72 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -120,6 +120,7 @@ enum {
Opt_last_int,
/* int args above */
Opt_snapdirname,
+ Opt_mds_namespace,
Opt_last_string,
/* string args above */
Opt_dirstat,
@@ -154,6 +155,7 @@ static match_table_t fsopt_tokens = {
{Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
+ {Opt_mds_namespace, "mds_namespace=%s"},
/* string args above */
{Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"},
@@ -210,7 +212,13 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->snapdir_name)
return -ENOMEM;
break;
-
+ case Opt_mds_namespace:
+ fsopt->mds_namespace = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
+ break;
/* misc */
case Opt_wsize:
fsopt->wsize = intval;
@@ -297,6 +305,8 @@ static void destroy_mount_options(struct ceph_mount_options *args)
{
dout("destroy_mount_options %p\n", args);
kfree(args->snapdir_name);
+ kfree(args->mds_namespace);
+ kfree(args->server_path);
kfree(args);
}
@@ -327,6 +337,13 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+ if (ret)
+ return ret;
return ceph_compare_options(new_opt, fsc->client);
}
@@ -334,8 +351,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
static int parse_mount_options(struct ceph_mount_options **pfsopt,
struct ceph_options **popt,
int flags, char *options,
- const char *dev_name,
- const char **path)
+ const char *dev_name)
{
struct ceph_mount_options *fsopt;
const char *dev_name_end;
@@ -380,12 +396,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
*/
dev_name_end = strchr(dev_name, '/');
if (dev_name_end) {
- /* skip over leading '/' for path */
- *path = dev_name_end + 1;
+ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+ if (!fsopt->server_path) {
+ err = -ENOMEM;
+ goto out;
+ }
} else {
- /* path is empty */
dev_name_end = dev_name + strlen(dev_name);
- *path = dev_name_end;
}
err = -EINVAL;
dev_name_end--; /* back up to ':' separator */
@@ -395,7 +412,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
goto out;
}
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
- dout("server path '%s'\n", *path);
+ if (fsopt->server_path)
+ dout("server path '%s'\n", fsopt->server_path);
*popt = ceph_parse_options(options, dev_name, dev_name_end,
parse_fsopt_token, (void *)fsopt);
@@ -457,6 +475,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
+ if (fsopt->mds_namespace)
+ seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace);
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -495,9 +515,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(fsc->mdsc, msg);
+ ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
+ return 0;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
return 0;
-
default:
return -1;
}
@@ -511,9 +533,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
{
struct ceph_fs_client *fsc;
const u64 supported_features =
- CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH |
- CEPH_FEATURE_MDS_INLINE_DATA;
+ CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
+ CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
const u64 required_features = 0;
int page_count;
size_t size;
@@ -530,7 +551,14 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
+
+ if (fsopt->mds_namespace == NULL) {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ } else {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
+ 0, false);
+ }
fsc->mount_options = fsopt;
@@ -560,7 +588,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
/* set up mempools */
err = -ENOMEM;
- page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
+ page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
size = sizeof (struct page *) * (page_count ? page_count : 1);
fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
@@ -658,8 +686,8 @@ static int __init init_caches(void)
if (ceph_dentry_cachep == NULL)
goto bad_dentry;
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+
if (ceph_file_cachep == NULL)
goto bad_file;
@@ -717,6 +745,7 @@ static const struct super_operations ceph_super_ops = {
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
.drop_inode = ceph_drop_inode,
+ .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
@@ -785,8 +814,7 @@ out:
/*
* mount: join the ceph cluster, and open root directory.
*/
-static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
- const char *path)
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
{
int err;
unsigned long started = jiffies; /* note the start time */
@@ -815,11 +843,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
goto fail;
}
- if (path[0] == 0) {
+ if (!fsc->mount_options->server_path) {
root = fsc->sb->s_root;
dget(root);
} else {
- dout("mount opening base mountpoint\n");
+ const char *path = fsc->mount_options->server_path + 1;
+ dout("mount opening path %s\n", path);
root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
@@ -912,13 +941,13 @@ static int ceph_register_bdi(struct super_block *sb,
int err;
/* set ra_pages based on rasize mount option? */
- if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
+ if (fsc->mount_options->rasize >= PAGE_SIZE)
fsc->backing_dev_info.ra_pages =
- (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
+ (fsc->mount_options->rasize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else
fsc->backing_dev_info.ra_pages =
- VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+ VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
@@ -935,7 +964,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
struct dentry *res;
int err;
int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
- const char *path = NULL;
struct ceph_mount_options *fsopt = NULL;
struct ceph_options *opt = NULL;
@@ -944,7 +972,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
flags |= MS_POSIXACL;
#endif
- err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+ err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
if (err < 0) {
res = ERR_PTR(err);
goto out_final;
@@ -987,7 +1015,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
}
}
- res = ceph_real_mount(fsc, path);
+ res = ceph_real_mount(fsc);
if (IS_ERR(res))
goto out_splat;
dout("root %p inode %p ino %llx.%llx\n", res,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e705c4d612d76..3e3fa91630596 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -69,6 +69,8 @@ struct ceph_mount_options {
*/
char *snapdir_name; /* default ".snap" */
+ char *mds_namespace; /* default NULL */
+ char *server_path; /* default "/" */
};
struct ceph_fs_client {
@@ -101,7 +103,6 @@ struct ceph_fs_client {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
- struct workqueue_struct *revalidate_wq;
#endif
};
@@ -146,6 +147,14 @@ struct ceph_cap {
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+struct ceph_cap_flush {
+ u64 tid;
+ int caps; /* 0 means capsnap */
+ bool wake; /* wake up flush waiters when finish ? */
+ struct list_head g_list; // global
+ struct list_head i_list; // per inode
+};
+
/*
* Snapped cap state that is pending flush to mds. When a snapshot occurs,
* we first complete any in-process sync writes and writeback any dirty
@@ -153,10 +162,11 @@ struct ceph_cap {
*/
struct ceph_cap_snap {
atomic_t nref;
- struct ceph_inode_info *ci;
- struct list_head ci_item, flushing_item;
+ struct list_head ci_item;
- u64 follows, flush_tid;
+ struct ceph_cap_flush cap_flush;
+
+ u64 follows;
int issued, dirty;
struct ceph_snap_context *context;
@@ -185,16 +195,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
}
}
-struct ceph_cap_flush {
- u64 tid;
- int caps;
- struct rb_node g_node; // global
- union {
- struct rb_node i_node; // inode
- struct list_head list;
- };
-};
-
/*
* The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where
@@ -245,7 +245,7 @@ struct ceph_dentry_info {
unsigned long lease_renew_after, lease_renew_from;
struct list_head lru;
struct dentry *dentry;
- u64 time;
+ unsigned long time;
u64 offset;
};
@@ -286,7 +286,6 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
- size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
@@ -295,6 +294,7 @@ struct ceph_inode_info {
u64 i_files, i_subdirs;
struct rb_root i_fragtree;
+ int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs;
@@ -309,7 +309,7 @@ struct ceph_inode_info {
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
struct ceph_cap_flush *i_prealloc_cap_flush;
- struct rb_root i_cap_flush_tree;
+ struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
@@ -320,7 +320,7 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
- int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+ int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
@@ -357,8 +357,7 @@ struct ceph_inode_info {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
- u32 i_fscache_gen; /* sequence, for delayed fscache validate */
- struct work_struct i_revalidate_work;
+ u32 i_fscache_gen;
#endif
struct inode vfs_inode; /* at end */
};
@@ -469,6 +468,9 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
+#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
@@ -537,11 +539,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
- return ((loff_t)frag << 32) | (loff_t)off;
-}
-
/*
* caps helpers
*/
@@ -632,7 +629,6 @@ struct ceph_file_info {
struct ceph_mds_request *last_readdir;
/* readdir: position within a frag */
- unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
long long dir_release_count;
@@ -754,6 +750,7 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -785,19 +782,15 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
return __ceph_do_getattr(inode, NULL, mask, force);
}
extern int ceph_permission(struct inode *inode, int mask);
+extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
/* xattr.c */
-extern int ceph_setxattr(struct dentry *, const char *, const void *,
- size_t, int);
-int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
+int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
-int __ceph_removexattr(struct dentry *, const char *);
-extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
-extern int ceph_removexattr(struct dentry *, const char *);
extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
extern void __init ceph_xattr_init(void);
@@ -898,9 +891,8 @@ extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again);
+extern void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -915,10 +907,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
/* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
- ci->i_nr_by_mode[mode]++;
-}
+extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
/* addr.c */
@@ -931,6 +920,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */
extern const struct file_operations ceph_file_fops;
+extern int ceph_renew_caps(struct inode *inode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode,
@@ -938,6 +928,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len);
+extern void ceph_sync_write_wait(struct inode *inode);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops;
@@ -946,6 +937,7 @@ extern const struct inode_operations ceph_snapdir_iops;
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops;
+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
extern int ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry, int err);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9410abdef3cec..adc231892b0d5 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -16,6 +16,8 @@
static int __remove_xattr(struct ceph_inode_info *ci,
struct ceph_inode_xattr *xattr);
+const struct xattr_handler ceph_other_xattr_handler;
+
/*
* List of handlers for synthetic system.* attributes. Other
* attributes are handled directly.
@@ -25,6 +27,7 @@ const struct xattr_handler *ceph_xattr_handlers[] = {
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
#endif
+ &ceph_other_xattr_handler,
NULL,
};
@@ -33,7 +36,6 @@ static bool ceph_is_valid_xattr(const char *name)
return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) ||
- !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
@@ -55,81 +57,88 @@ struct ceph_vxattr {
static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
{
- size_t s;
- char *p = (char *)&ci->i_layout;
-
- for (s = 0; s < sizeof(ci->i_layout); s++, p++)
- if (*p)
- return true;
- return false;
+ struct ceph_file_layout *fl = &ci->i_layout;
+ return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
+ fl->object_size > 0 || fl->pool_id >= 0 ||
+ rcu_dereference_raw(fl->pool_ns) != NULL);
}
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ struct ceph_string *pool_ns;
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
+ const char *ns_field = " pool_namespace=";
char buf[128];
+ size_t len, total_len = 0;
+ int ret;
+
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
- down_read(&osdc->map_sem);
+ down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
- size_t len = strlen(pool_name);
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- if (!size) {
- ret += len;
- } else if (ret + len > size) {
- ret = -ERANGE;
- } else {
- memcpy(val, buf, ret);
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size);
+ total_len = len + strlen(pool_name);
+ } else {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size, (unsigned long long)pool);
+ total_len = len;
+ }
+
+ if (pool_ns)
+ total_len += strlen(ns_field) + pool_ns->len;
+
+ if (!size) {
+ ret = total_len;
+ } else if (total_len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, len);
+ ret = len;
+ if (pool_name) {
+ len = strlen(pool_name);
memcpy(val + ret, pool_name, len);
ret += len;
}
- } else {
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- (unsigned long long)pool);
- if (size) {
- if (ret <= size)
- memcpy(val, buf, ret);
- else
- ret = -ERANGE;
+ if (pool_ns) {
+ len = strlen(ns_field);
+ memcpy(val + ret, ns_field, len);
+ ret += len;
+ memcpy(val + ret, pool_ns->str, pool_ns->len);
+ ret += pool_ns->len;
}
}
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
+ ceph_put_string(pool_ns);
return ret;
}
static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
}
static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_count);
}
static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.object_size);
}
static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
@@ -138,16 +147,28 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
- down_read(&osdc->map_sem);
+ down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name)
ret = snprintf(val, size, "%s", pool_name);
else
ret = snprintf(val, size, "%lld", (unsigned long long)pool);
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret = 0;
+ struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ if (ns) {
+ ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+ ceph_put_string(ns);
+ }
return ret;
}
@@ -239,6 +260,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
XATTR_LAYOUT_FIELD(dir, layout, object_size),
XATTR_LAYOUT_FIELD(dir, layout, pool),
+ XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs),
@@ -266,6 +288,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
XATTR_LAYOUT_FIELD(file, layout, stripe_count),
XATTR_LAYOUT_FIELD(file, layout, object_size),
XATTR_LAYOUT_FIELD(file, layout, pool),
+ XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
{ .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_file_vxattrs_name_size; /* total size of all names */
@@ -496,19 +519,6 @@ static int __remove_xattr(struct ceph_inode_info *ci,
return 0;
}
-static int __remove_xattr_by_name(struct ceph_inode_info *ci,
- const char *name)
-{
- struct rb_node **p;
- struct ceph_inode_xattr *xattr;
- int err;
-
- p = &ci->i_xattrs.index.rb_node;
- xattr = __get_xattr(ci, name);
- err = __remove_xattr(ci, xattr);
- return err;
-}
-
static char *__copy_xattr_names(struct ceph_inode_info *ci,
char *dest)
{
@@ -740,9 +750,6 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
int req_mask;
int err;
- if (!ceph_is_valid_xattr(name))
- return -ENODATA;
-
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
@@ -804,15 +811,6 @@ out:
return err;
}
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
- size_t size)
-{
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, value, size);
-
- return __ceph_getxattr(d_inode(dentry), name, value, size);
-}
-
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = d_inode(dentry);
@@ -877,15 +875,15 @@ out:
return err;
}
-static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
+static int ceph_sync_setxattr(struct inode *inode, const char *name,
const char *value, size_t size, int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *inode = d_inode(dentry);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_pagelist *pagelist = NULL;
+ int op = CEPH_MDS_OP_SETXATTR;
int err;
if (size > 0) {
@@ -899,20 +897,21 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
if (err)
goto out;
} else if (!value) {
- flags |= CEPH_XATTR_REMOVE;
+ if (flags & CEPH_XATTR_REPLACE)
+ op = CEPH_MDS_OP_RMXATTR;
+ else
+ flags |= CEPH_XATTR_REMOVE;
}
dout("setxattr value=%.*s\n", (int)size, value);
/* do request */
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
- USE_AUTH_MDS);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
}
- req->r_args.setxattr.flags = cpu_to_le32(flags);
req->r_path2 = kstrdup(name, GFP_NOFS);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
@@ -920,8 +919,11 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
goto out;
}
- req->r_pagelist = pagelist;
- pagelist = NULL;
+ if (op == CEPH_MDS_OP_SETXATTR) {
+ req->r_args.setxattr.flags = cpu_to_le32(flags);
+ req->r_pagelist = pagelist;
+ pagelist = NULL;
+ }
req->r_inode = inode;
ihold(inode);
@@ -939,13 +941,12 @@ out:
return err;
}
-int __ceph_setxattr(struct dentry *dentry, const char *name,
+int __ceph_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
int issued;
int err;
@@ -958,8 +959,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
int required_blob_size;
bool lock_snap_rwsem = false;
- if (!ceph_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && vxattr->readonly)
@@ -1056,7 +1057,7 @@ do_sync_unlocked:
"during filling trace\n", inode);
err = -EBUSY;
} else {
- err = ceph_sync_setxattr(dentry, name, value, size, flags);
+ err = ceph_sync_setxattr(inode, name, value, size, flags);
}
out:
ceph_free_cap_flush(prealloc_cf);
@@ -1066,146 +1067,30 @@ out:
return err;
}
-int ceph_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int ceph_get_xattr_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
{
- if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
- return -EROFS;
-
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return __ceph_setxattr(dentry, name, value, size, flags);
-}
-
-static int ceph_send_removexattr(struct dentry *dentry, const char *name)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = d_inode(dentry);
- struct ceph_mds_request *req;
- int err;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
- USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_path2 = kstrdup(name, GFP_NOFS);
- if (!req->r_path2)
- return -ENOMEM;
-
- req->r_inode = inode;
- ihold(inode);
- req->r_num_caps = 1;
- req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- ceph_mdsc_put_request(req);
- return err;
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+ return __ceph_getxattr(inode, name, value, size);
}
-int __ceph_removexattr(struct dentry *dentry, const char *name)
+static int ceph_set_xattr_handler(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
- struct ceph_vxattr *vxattr;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
- struct ceph_cap_flush *prealloc_cf = NULL;
- int issued;
- int err;
- int required_blob_size;
- int dirty;
- bool lock_snap_rwsem = false;
-
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
-
- vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
-
- /* pass any unhandled ceph.* xattrs through to the MDS */
- if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
- goto do_sync_unlocked;
-
- prealloc_cf = ceph_alloc_cap_flush();
- if (!prealloc_cf)
- return -ENOMEM;
-
- err = -ENOMEM;
- spin_lock(&ci->i_ceph_lock);
-retry:
- issued = __ceph_caps_issued(ci, NULL);
- if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
- goto do_sync;
-
- if (!lock_snap_rwsem && !ci->i_head_snapc) {
- lock_snap_rwsem = true;
- if (!down_read_trylock(&mdsc->snap_rwsem)) {
- spin_unlock(&ci->i_ceph_lock);
- down_read(&mdsc->snap_rwsem);
- spin_lock(&ci->i_ceph_lock);
- goto retry;
- }
- }
-
- dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
-
- __build_xattrs(inode);
-
- required_blob_size = __get_required_blob_size(ci, 0, 0);
-
- if (!ci->i_xattrs.prealloc_blob ||
- required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
- struct ceph_buffer *blob;
-
- spin_unlock(&ci->i_ceph_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
- blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
- if (!blob)
- goto do_sync_unlocked;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
- ci->i_xattrs.prealloc_blob = blob;
- goto retry;
- }
-
- err = __remove_xattr_by_name(ceph_inode(inode), name);
-
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
- &prealloc_cf);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = current_fs_time(inode->i_sb);
- spin_unlock(&ci->i_ceph_lock);
- if (lock_snap_rwsem)
- up_read(&mdsc->snap_rwsem);
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- ceph_free_cap_flush(prealloc_cf);
- return err;
-do_sync:
- spin_unlock(&ci->i_ceph_lock);
-do_sync_unlocked:
- if (lock_snap_rwsem)
- up_read(&mdsc->snap_rwsem);
- ceph_free_cap_flush(prealloc_cf);
- err = ceph_send_removexattr(dentry, name);
- return err;
+ return __ceph_setxattr(inode, name, value, size, flags);
}
-int ceph_removexattr(struct dentry *dentry, const char *name)
-{
- if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
- return -EROFS;
-
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
-
- return __ceph_removexattr(dentry, name);
-}
+const struct xattr_handler ceph_other_xattr_handler = {
+ .prefix = "", /* match any name => handlers called with full name */
+ .get = ceph_get_xattr_handler,
+ .set = ceph_set_xattr_handler,
+};
#ifdef CONFIG_SECURITY
bool ceph_security_xattr_wanted(struct inode *in)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 24b142569ca9b..6edd825231c5d 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -91,6 +91,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
break;
}
+ if (i < CHRDEV_MAJOR_DYN_END)
+ pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n",
+ name, i);
+
if (i == 0) {
ret = -EBUSY;
goto out;
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 1964d212ab08c..eed7eb09f46f3 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,9 +5,10 @@ obj-$(CONFIG_CIFS) += cifs.o
cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
- cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
+ cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o
+cifs-$(CONFIG_CIFS_XATTR) += xattr.o
cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 788e19195991a..6c58e13fed2f1 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -244,7 +244,6 @@ static int cifs_debug_data_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations cifs_debug_data_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_debug_data_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -361,7 +360,6 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations cifs_stats_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_stats_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -447,7 +445,6 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
}
static const struct file_operations cifsFYI_proc_fops = {
- .owner = THIS_MODULE,
.open = cifsFYI_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -479,7 +476,6 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file,
}
static const struct file_operations cifs_linux_ext_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_linux_ext_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -511,7 +507,6 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file,
}
static const struct file_operations cifs_lookup_cache_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_lookup_cache_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -543,7 +538,6 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
}
static const struct file_operations traceSMB_proc_fops = {
- .owner = THIS_MODULE,
.open = traceSMB_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -655,7 +649,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
}
static const struct file_operations cifs_security_flags_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_security_flags_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index e956cba943381..ec9dbbcca3b90 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -151,8 +151,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (sb_mountdata == NULL)
return ERR_PTR(-EINVAL);
- if (strlen(fullpath) - ref->path_consumed)
+ if (strlen(fullpath) - ref->path_consumed) {
prepath = fullpath + ref->path_consumed;
+ /* skip initial delimiter */
+ if (*prepath == '/' || *prepath == '\\')
+ prepath++;
+ }
*devname = cifs_build_devname(ref->node_name, prepath);
if (IS_ERR(*devname)) {
@@ -302,7 +306,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (full_path == NULL)
goto cdda_exit;
- cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb);
+ cifs_sb = CIFS_SB(mntpt->d_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
mnt = ERR_CAST(tlink);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 3182273a34079..1418daa03d959 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -46,6 +46,9 @@
#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
#define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */
+#define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible
+ * root mountable
+ */
struct cifs_sb_info {
struct rb_root tlink_tree;
@@ -67,5 +70,6 @@ struct cifs_sb_info {
struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
struct rcu_head rcu;
+ char *prepath;
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 6908080e9b6d8..b611fc2e8984e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -24,10 +24,13 @@
#include <linux/string.h>
#include <keys/user-type.h>
#include <linux/key-type.h>
+#include <linux/keyctl.h>
#include <linux/inet.h>
#include "cifsglob.h"
#include "cifs_spnego.h"
#include "cifs_debug.h"
+#include "cifsproto.h"
+static const struct cred *spnego_cred;
/* create a new cifs key */
static int
@@ -102,6 +105,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
size_t desc_len;
struct key *spnego_key;
const char *hostname = server->hostname;
+ const struct cred *saved_cred;
/* length of fields (with semicolons): ver=0xyz ip4=ipaddress
host=hostname sec=mechanism uid=0xFF user=username */
@@ -163,7 +167,9 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
sprintf(dp, ";pid=0x%x", current->pid);
cifs_dbg(FYI, "key description = %s\n", description);
+ saved_cred = override_creds(spnego_cred);
spnego_key = request_key(&cifs_spnego_key_type, description, "");
+ revert_creds(saved_cred);
#ifdef CONFIG_CIFS_DEBUG2
if (cifsFYI && !IS_ERR(spnego_key)) {
@@ -177,3 +183,64 @@ out:
kfree(description);
return spnego_key;
}
+
+int
+init_cifs_spnego(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret;
+
+ cifs_dbg(FYI, "Registering the %s key type\n",
+ cifs_spnego_key_type.name);
+
+ /*
+ * Create an override credential set with special thread keyring for
+ * spnego upcalls.
+ */
+
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = keyring_alloc(".cifs_spnego",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = register_key_type(&cifs_spnego_key_type);
+ if (ret < 0)
+ goto failed_put_key;
+
+ /*
+ * instruct request_key() to use this special keyring as a cache for
+ * the results it looks up
+ */
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ spnego_cred = cred;
+
+ cifs_dbg(FYI, "cifs spnego keyring: %d\n", key_serial(keyring));
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+
+void
+exit_cifs_spnego(void)
+{
+ key_revoke(spnego_cred->thread_keyring);
+ unregister_key_type(&cifs_spnego_key_type);
+ put_cred(spnego_cred);
+ cifs_dbg(FYI, "Unregistered %s key type\n", cifs_spnego_key_type.name);
+}
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 5a53ac6b1e025..02b071bf3732a 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -101,6 +101,12 @@ convert_sfm_char(const __u16 src_char, char *target)
case SFM_SLASH:
*target = '\\';
break;
+ case SFM_SPACE:
+ *target = ' ';
+ break;
+ case SFM_PERIOD:
+ *target = '.';
+ break;
default:
return false;
}
@@ -404,7 +410,7 @@ static __le16 convert_to_sfu_char(char src_char)
return dest_char;
}
-static __le16 convert_to_sfm_char(char src_char)
+static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
{
__le16 dest_char;
@@ -427,6 +433,18 @@ static __le16 convert_to_sfm_char(char src_char)
case '|':
dest_char = cpu_to_le16(SFM_PIPE);
break;
+ case '.':
+ if (end_of_string)
+ dest_char = cpu_to_le16(SFM_PERIOD);
+ else
+ dest_char = 0;
+ break;
+ case ' ':
+ if (end_of_string)
+ dest_char = cpu_to_le16(SFM_SPACE);
+ else
+ dest_char = 0;
+ break;
default:
dest_char = 0;
}
@@ -469,9 +487,16 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
/* see if we must remap this char */
if (map_chars == SFU_MAP_UNI_RSVD)
dst_char = convert_to_sfu_char(src_char);
- else if (map_chars == SFM_MAP_UNI_RSVD)
- dst_char = convert_to_sfm_char(src_char);
- else
+ else if (map_chars == SFM_MAP_UNI_RSVD) {
+ bool end_of_string;
+
+ if (i == srclen - 1)
+ end_of_string = true;
+ else
+ end_of_string = false;
+
+ dst_char = convert_to_sfm_char(src_char, end_of_string);
+ } else
dst_char = 0;
/*
* FIXME: We can not handle remapping backslash (UNI_SLASH)
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index bdc52cb9a676d..479bc0a941f35 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -64,6 +64,8 @@
#define SFM_LESSTHAN ((__u16) 0xF023)
#define SFM_PIPE ((__u16) 0xF027)
#define SFM_SLASH ((__u16) 0xF026)
+#define SFM_PERIOD ((__u16) 0xF028)
+#define SFM_SPACE ((__u16) 0xF029)
/*
* Mapping mechanism to use when one of the seven reserved characters is
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 3f93125916bf0..71e8a56e94795 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -360,7 +360,7 @@ init_cifs_idmap(void)
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 4897dacf89449..8347c90cf483c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -66,45 +66,15 @@ cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
return 0;
}
-/*
- * Calculate and return the CIFS signature based on the mac key and SMB PDU.
- * The 16 byte signature must be allocated by the caller. Note we only use the
- * 1st eight bytes and that the smb header signature field on input contains
- * the sequence number before this function is called. Also, this function
- * should be called with the server->srv_mutex held.
- */
-static int cifs_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server, char *signature)
+int __cifs_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server, char *signature,
+ struct shash_desc *shash)
{
int i;
int rc;
struct kvec *iov = rqst->rq_iov;
int n_vec = rqst->rq_nvec;
- if (iov == NULL || signature == NULL || server == NULL)
- return -EINVAL;
-
- if (!server->secmech.sdescmd5) {
- rc = cifs_crypto_shash_md5_allocate(server);
- if (rc) {
- cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
- return -1;
- }
- }
-
- rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
- server->session_key.response, server->session_key.len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
- return rc;
- }
-
for (i = 0; i < n_vec; i++) {
if (iov[i].iov_len == 0)
continue;
@@ -117,12 +87,10 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
if (i == 0) {
if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
break; /* nothing to sign or corrupt header */
- rc =
- crypto_shash_update(&server->secmech.sdescmd5->shash,
+ rc = crypto_shash_update(shash,
iov[i].iov_base + 4, iov[i].iov_len - 4);
} else {
- rc =
- crypto_shash_update(&server->secmech.sdescmd5->shash,
+ rc = crypto_shash_update(shash,
iov[i].iov_base, iov[i].iov_len);
}
if (rc) {
@@ -134,21 +102,64 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
/* now hash over the rq_pages array */
for (i = 0; i < rqst->rq_npages; i++) {
- struct kvec p_iov;
+ void *kaddr = kmap(rqst->rq_pages[i]);
+ size_t len = rqst->rq_pagesz;
+
+ if (i == rqst->rq_npages - 1)
+ len = rqst->rq_tailsz;
+
+ crypto_shash_update(shash, kaddr, len);
- cifs_rqst_page_to_kvec(rqst, i, &p_iov);
- crypto_shash_update(&server->secmech.sdescmd5->shash,
- p_iov.iov_base, p_iov.iov_len);
kunmap(rqst->rq_pages[i]);
}
- rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
+ rc = crypto_shash_final(shash, signature);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not generate hash\n", __func__);
return rc;
}
+/*
+ * Calculate and return the CIFS signature based on the mac key and SMB PDU.
+ * The 16 byte signature must be allocated by the caller. Note we only use the
+ * 1st eight bytes and that the smb header signature field on input contains
+ * the sequence number before this function is called. Also, this function
+ * should be called with the server->srv_mutex held.
+ */
+static int cifs_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server, char *signature)
+{
+ int rc;
+
+ if (!rqst->rq_iov || !signature || !server)
+ return -EINVAL;
+
+ if (!server->secmech.sdescmd5) {
+ rc = cifs_crypto_shash_md5_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
+ return -1;
+ }
+ }
+
+ rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
+ return rc;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
+ server->session_key.response, server->session_key.len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ return rc;
+ }
+
+ return __cifs_calc_signature(rqst, server, signature,
+ &server->secmech.sdescmd5->shash);
+}
+
/* must be called with server->srv_mutex held */
int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
__u32 *pexpected_response_sequence_number)
@@ -732,24 +743,26 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+ mutex_lock(&ses->server->srv_mutex);
+
rc = crypto_hmacmd5_alloc(ses->server);
if (rc) {
cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
/* calculate ntlmv2_hash */
rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
if (rc) {
cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
/* calculate first part of the client response (CR1) */
rc = CalcNTLMv2_response(ses, ntlmv2_hash);
if (rc) {
cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
/* now calculate the session key for NTLMv2 */
@@ -758,13 +771,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (rc) {
cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
__func__);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
@@ -772,7 +785,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
@@ -780,6 +793,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+unlock:
+ mutex_unlock(&ses->server->srv_mutex);
setup_ntlmv2_rsp_ret:
kfree(tiblob);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1d86fc620e5c2..6bbec5e784cd4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -37,6 +37,7 @@
#include <linux/freezer.h>
#include <linux/namei.h>
#include <linux/random.h>
+#include <linux/xattr.h>
#include <net/ipv6.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -86,6 +87,7 @@ extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
+__u32 cifs_lock_secret;
/*
* Bumps refcount for cifs super block.
@@ -135,6 +137,7 @@ cifs_read_super(struct super_block *sb)
sb->s_magic = CIFS_MAGIC_NUMBER;
sb->s_op = &cifs_super_ops;
+ sb->s_xattr = cifs_xattr_handlers;
sb->s_bdi = &cifs_sb->bdi;
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
@@ -686,6 +689,14 @@ cifs_do_mount(struct file_system_type *fs_type,
goto out_cifs_sb;
}
+ if (volume_info->prepath) {
+ cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL);
+ if (cifs_sb->prepath == NULL) {
+ root = ERR_PTR(-ENOMEM);
+ goto out_cifs_sb;
+ }
+ }
+
cifs_setup_cifs_sb(volume_info, cifs_sb);
rc = cifs_mount(cifs_sb, volume_info);
@@ -724,7 +735,11 @@ cifs_do_mount(struct file_system_type *fs_type,
sb->s_flags |= MS_ACTIVE;
}
- root = cifs_get_root(volume_info, sb);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+ root = dget(sb->s_root);
+ else
+ root = cifs_get_root(volume_info, sb);
+
if (IS_ERR(root))
goto out_super;
@@ -888,44 +903,33 @@ const struct inode_operations cifs_dir_inode_ops = {
.rmdir = cifs_rmdir,
.rename2 = cifs_rename2,
.permission = cifs_permission,
-/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
.symlink = cifs_symlink,
.mknod = cifs_mknod,
-#ifdef CONFIG_CIFS_XATTR
- .setxattr = cifs_setxattr,
- .getxattr = cifs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = cifs_listxattr,
- .removexattr = cifs_removexattr,
-#endif
+ .removexattr = generic_removexattr,
};
const struct inode_operations cifs_file_inode_ops = {
-/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
- .getattr = cifs_getattr, /* do we need this anymore? */
+ .getattr = cifs_getattr,
.permission = cifs_permission,
-#ifdef CONFIG_CIFS_XATTR
- .setxattr = cifs_setxattr,
- .getxattr = cifs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = cifs_listxattr,
- .removexattr = cifs_removexattr,
-#endif
+ .removexattr = generic_removexattr,
};
const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
.get_link = cifs_get_link,
.permission = cifs_permission,
- /* BB add the following two eventually */
- /* revalidate: cifs_revalidate,
- setattr: cifs_notify_change, *//* BB do we need notify change */
-#ifdef CONFIG_CIFS_XATTR
- .setxattr = cifs_setxattr,
- .getxattr = cifs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = cifs_listxattr,
- .removexattr = cifs_removexattr,
-#endif
+ .removexattr = generic_removexattr,
};
static int cifs_clone_file_range(struct file *src_file, loff_t off,
@@ -962,7 +966,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
cifs_dbg(FYI, "about to flush pages\n");
/* should we flush first and last page first */
truncate_inode_pages_range(&target_inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len)-1);
+ PAGE_ALIGN(destoff + len)-1);
if (target_tcon->ses->server->ops->duplicate_extents)
rc = target_tcon->ses->server->ops->duplicate_extents(xid,
@@ -1083,7 +1087,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
};
const struct file_operations cifs_dir_ops = {
- .iterate = cifs_readdir,
+ .iterate_shared = cifs_readdir,
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
@@ -1275,6 +1279,8 @@ init_cifs(void)
spin_lock_init(&cifs_file_list_lock);
spin_lock_init(&GlobalMid_Lock);
+ get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret));
+
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
@@ -1307,7 +1313,7 @@ init_cifs(void)
goto out_destroy_mids;
#ifdef CONFIG_CIFS_UPCALL
- rc = register_key_type(&cifs_spnego_key_type);
+ rc = init_cifs_spnego();
if (rc)
goto out_destroy_request_bufs;
#endif /* CONFIG_CIFS_UPCALL */
@@ -1330,7 +1336,7 @@ out_init_cifs_idmap:
out_register_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
- unregister_key_type(&cifs_spnego_key_type);
+ exit_cifs_spnego();
out_destroy_request_bufs:
#endif
cifs_destroy_request_bufs();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 83aac8ba50b0e..9dcf974acc47f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,15 +120,19 @@ extern const char *cifs_get_link(struct dentry *, struct inode *,
struct delayed_call *);
extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
const char *symname);
-extern int cifs_removexattr(struct dentry *, const char *);
-extern int cifs_setxattr(struct dentry *, const char *, const void *,
- size_t, int);
-extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
+
+#ifdef CONFIG_CIFS_XATTR
+extern const struct xattr_handler *cifs_xattr_handlers[];
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
+#else
+# define cifs_xattr_handlers NULL
+# define cifs_listxattr NULL
+#endif
+
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.08"
+#define CIFS_VERSION "2.09"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d21da9f05baec..8f1d8c1e72bec 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -615,8 +615,6 @@ struct TCP_Server_Info {
bool sec_mskerberos; /* supports legacy MS Kerberos */
bool large_buf; /* is current buffer large? */
struct delayed_work echo; /* echo ping workqueue job */
- struct kvec *iov; /* reusable kvec array for receives */
- unsigned int nr_iov; /* number of kvecs in array */
char *smallbuf; /* pointer to current "small" buffer */
char *bigbuf; /* pointer to current "big" buffer */
unsigned int total_read; /* total amount of data read in this pass */
@@ -714,7 +712,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
*
* Note that this might make for "interesting" allocation problems during
* writeback however as we have to allocate an array of pointers for the
- * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ * pages. A 16M write means ~32kb page array with PAGE_SIZE == 4096.
*
* For reads, there is a similar problem as we need to allocate an array
* of kvecs to handle the receive, though that should only need to be done
@@ -733,7 +731,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
/*
* The default wsize is 1M. find_get_pages seems to return a maximum of 256
- * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
+ * pages in a single call. With PAGE_SIZE == 4k, this means we can fill
* a single wsize request with a single call.
*/
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
@@ -1621,6 +1619,7 @@ void cifs_oplock_break(struct work_struct *work);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
+extern __u32 cifs_lock_secret;
extern mempool_t *cifs_mid_poolp;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index eed7ff50faf01..1243bd326591a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -37,8 +37,6 @@ extern void cifs_buf_release(void *);
extern struct smb_hdr *cifs_small_buf_get(void);
extern void cifs_small_buf_release(void *);
extern void free_rsp_buf(int, void *);
-extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
- struct kvec *iov);
extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
unsigned int /* length */);
extern unsigned int _get_xid(void);
@@ -60,6 +58,8 @@ do { \
} while (0)
extern int init_cifs_idmap(void);
extern void exit_cifs_idmap(void);
+extern int init_cifs_spnego(void);
+extern void exit_cifs_spnego(void);
extern char *build_path_from_dentry(struct dentry *);
extern char *cifs_build_path_to_root(struct smb_vol *vol,
struct cifs_sb_info *cifs_sb,
@@ -181,10 +181,9 @@ extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
- unsigned int to_read);
-extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
- struct kvec *iov_orig, unsigned int nr_segs,
- unsigned int to_read);
+ unsigned int to_read);
+extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
+ struct page *page, unsigned int to_read);
extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
struct cifs_sb_info *cifs_sb);
extern int cifs_match_super(struct super_block *, void *);
@@ -512,4 +511,7 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
unsigned int *pbytes_written);
+int __cifs_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server, char *signature,
+ struct shash_desc *shash);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 76fcb50295a38..d47197ea4ab62 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1447,10 +1447,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
HEADER_SIZE(server) + 1;
- rdata->iov.iov_base = buf + HEADER_SIZE(server) - 1;
- rdata->iov.iov_len = len;
-
- length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
+ length = cifs_read_from_socket(server,
+ buf + HEADER_SIZE(server) - 1, len);
if (length < 0)
return length;
server->total_read += length;
@@ -1502,9 +1500,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
len = data_offset - server->total_read;
if (len > 0) {
/* read any junk before data into the rest of smallbuf */
- rdata->iov.iov_base = buf + server->total_read;
- rdata->iov.iov_len = len;
- length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
+ length = cifs_read_from_socket(server,
+ buf + server->total_read, len);
if (length < 0)
return length;
server->total_read += length;
@@ -1929,17 +1926,17 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wsize = server->ops->wp_retry_size(inode);
if (wsize < rest_len) {
- nr_pages = wsize / PAGE_CACHE_SIZE;
+ nr_pages = wsize / PAGE_SIZE;
if (!nr_pages) {
rc = -ENOTSUPP;
break;
}
- cur_len = nr_pages * PAGE_CACHE_SIZE;
- tailsz = PAGE_CACHE_SIZE;
+ cur_len = nr_pages * PAGE_SIZE;
+ tailsz = PAGE_SIZE;
} else {
- nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
+ nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
cur_len = rest_len;
- tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
+ tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
}
wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
@@ -1957,7 +1954,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->sync_mode = wdata->sync_mode;
wdata2->nr_pages = nr_pages;
wdata2->offset = page_offset(wdata2->pages[0]);
- wdata2->pagesz = PAGE_CACHE_SIZE;
+ wdata2->pagesz = PAGE_SIZE;
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
@@ -1975,7 +1972,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
if (rc != 0 && rc != -EAGAIN) {
SetPageError(wdata2->pages[j]);
end_page_writeback(wdata2->pages[j]);
- page_cache_release(wdata2->pages[j]);
+ put_page(wdata2->pages[j]);
}
}
@@ -2018,7 +2015,7 @@ cifs_writev_complete(struct work_struct *work)
else if (wdata->result < 0)
SetPageError(page);
end_page_writeback(page);
- page_cache_release(page);
+ put_page(page);
}
if (wdata->result != -EAGAIN)
mapping_set_error(inode->i_mapping, wdata->result);
@@ -3366,7 +3363,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
return -EOPNOTSUPP;
- if (acl_type & ACL_TYPE_ACCESS) {
+ if (acl_type == ACL_TYPE_ACCESS) {
count = le16_to_cpu(cifs_acl->access_entry_count);
pACE = &cifs_acl->ace_array[0];
size = sizeof(struct cifs_posix_acl);
@@ -3377,7 +3374,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
size_of_data_area, size);
return -EINVAL;
}
- } else if (acl_type & ACL_TYPE_DEFAULT) {
+ } else if (acl_type == ACL_TYPE_DEFAULT) {
count = le16_to_cpu(cifs_acl->access_entry_count);
size = sizeof(struct cifs_posix_acl);
size += sizeof(struct cifs_posix_ace) * count;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a763cd3d9e7c8..7ae03283bd61c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -428,7 +428,9 @@ cifs_echo_request(struct work_struct *work)
* server->ops->need_neg() == true. Also, no need to ping if
* we got a response recently.
*/
- if (!server->ops->need_neg || server->ops->need_neg(server) ||
+
+ if (server->tcpStatus == CifsNeedReconnect ||
+ server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew ||
(server->ops->can_echo && !server->ops->can_echo(server)) ||
time_before(jiffies, server->lstrp + echo_interval - HZ))
goto requeue_echo;
@@ -501,99 +503,34 @@ server_unresponsive(struct TCP_Server_Info *server)
return false;
}
-/*
- * kvec_array_init - clone a kvec array, and advance into it
- * @new: pointer to memory for cloned array
- * @iov: pointer to original array
- * @nr_segs: number of members in original array
- * @bytes: number of bytes to advance into the cloned array
- *
- * This function will copy the array provided in iov to a section of memory
- * and advance the specified number of bytes into the new array. It returns
- * the number of segments in the new array. "new" must be at least as big as
- * the original iov array.
- */
-static unsigned int
-kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
- size_t bytes)
-{
- size_t base = 0;
-
- while (bytes || !iov->iov_len) {
- int copy = min(bytes, iov->iov_len);
-
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- nr_segs--;
- base = 0;
- }
- }
- memcpy(new, iov, sizeof(*iov) * nr_segs);
- new->iov_base += base;
- new->iov_len -= base;
- return nr_segs;
-}
-
-static struct kvec *
-get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
-{
- struct kvec *new_iov;
-
- if (server->iov && nr_segs <= server->nr_iov)
- return server->iov;
-
- /* not big enough -- allocate a new one and release the old */
- new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS);
- if (new_iov) {
- kfree(server->iov);
- server->iov = new_iov;
- server->nr_iov = nr_segs;
- }
- return new_iov;
-}
-
-int
-cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
- unsigned int nr_segs, unsigned int to_read)
+static int
+cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
{
int length = 0;
int total_read;
- unsigned int segs;
- struct msghdr smb_msg;
- struct kvec *iov;
- iov = get_server_iovec(server, nr_segs);
- if (!iov)
- return -ENOMEM;
+ smb_msg->msg_control = NULL;
+ smb_msg->msg_controllen = 0;
- smb_msg.msg_control = NULL;
- smb_msg.msg_controllen = 0;
-
- for (total_read = 0; to_read; total_read += length, to_read -= length) {
+ for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
try_to_freeze();
- if (server_unresponsive(server)) {
- total_read = -ECONNABORTED;
- break;
- }
+ if (server_unresponsive(server))
+ return -ECONNABORTED;
- segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
+ length = sock_recvmsg(server->ssocket, smb_msg, 0);
- length = kernel_recvmsg(server->ssocket, &smb_msg,
- iov, segs, to_read, 0);
+ if (server->tcpStatus == CifsExiting)
+ return -ESHUTDOWN;
- if (server->tcpStatus == CifsExiting) {
- total_read = -ESHUTDOWN;
- break;
- } else if (server->tcpStatus == CifsNeedReconnect) {
+ if (server->tcpStatus == CifsNeedReconnect) {
cifs_reconnect(server);
- total_read = -ECONNABORTED;
- break;
- } else if (length == -ERESTARTSYS ||
- length == -EAGAIN ||
- length == -EINTR) {
+ return -ECONNABORTED;
+ }
+
+ if (length == -ERESTARTSYS ||
+ length == -EAGAIN ||
+ length == -EINTR) {
/*
* Minimum sleep to prevent looping, allowing socket
* to clear and app threads to set tcpStatus
@@ -602,12 +539,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
usleep_range(1000, 2000);
length = 0;
continue;
- } else if (length <= 0) {
- cifs_dbg(FYI, "Received no data or error: expecting %d\n"
- "got %d", to_read, length);
+ }
+
+ if (length <= 0) {
+ cifs_dbg(FYI, "Received no data or error: %d\n", length);
cifs_reconnect(server);
- total_read = -ECONNABORTED;
- break;
+ return -ECONNABORTED;
}
}
return total_read;
@@ -617,12 +554,21 @@ int
cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read)
{
- struct kvec iov;
+ struct msghdr smb_msg;
+ struct kvec iov = {.iov_base = buf, .iov_len = to_read};
+ iov_iter_kvec(&smb_msg.msg_iter, READ | ITER_KVEC, &iov, 1, to_read);
- iov.iov_base = buf;
- iov.iov_len = to_read;
+ return cifs_readv_from_socket(server, &smb_msg);
+}
- return cifs_readv_from_socket(server, &iov, 1, to_read);
+int
+cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
+ unsigned int to_read)
+{
+ struct msghdr smb_msg;
+ struct bio_vec bv = {.bv_page = page, .bv_len = to_read};
+ iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read);
+ return cifs_readv_from_socket(server, &smb_msg);
}
static bool
@@ -783,7 +729,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
}
kfree(server->hostname);
- kfree(server->iov);
kfree(server);
length = atomic_dec_return(&tcpSesAllocCount);
@@ -1196,8 +1141,12 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol)
convert_delimiter(vol->UNC, '\\');
- /* If pos is NULL, or is a bogus trailing delimiter then no prepath */
- if (!*pos++ || !*pos)
+ /* skip any delimiter */
+ if (*pos == '/' || *pos == '\\')
+ pos++;
+
+ /* If pos is NULL then no prepath */
+ if (!*pos)
return 0;
vol->prepath = kstrdup(pos, GFP_KERNEL);
@@ -1279,6 +1228,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->ops = &smb1_operations;
vol->vals = &smb1_values;
+ vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT;
+
if (!mountdata)
goto cifs_parse_mount_err;
@@ -2100,7 +2051,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
if (!match_security(server, vol))
return 0;
- if (server->echo_interval != vol->echo_interval)
+ if (server->echo_interval != vol->echo_interval * HZ)
return 0;
return 1;
@@ -2918,7 +2869,7 @@ static inline void
cifs_reclassify_socket4(struct socket *sock)
{
struct sock *sk = sock->sk;
- BUG_ON(sock_owned_by_user(sk));
+ BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
&cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
}
@@ -2927,7 +2878,7 @@ static inline void
cifs_reclassify_socket6(struct socket *sock)
{
struct sock *sk = sock->sk;
- BUG_ON(sock_owned_by_user(sk));
+ BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
&cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
}
@@ -3534,6 +3485,44 @@ cifs_get_volume_info(char *mount_data, const char *devname)
return volume_info;
}
+static int
+cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
+ unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ char *full_path)
+{
+ int rc;
+ char *s;
+ char sep, tmp;
+
+ sep = CIFS_DIR_SEP(cifs_sb);
+ s = full_path;
+
+ rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, "");
+ while (rc == 0) {
+ /* skip separators */
+ while (*s == sep)
+ s++;
+ if (!*s)
+ break;
+ /* next separator */
+ while (*s && *s != sep)
+ s++;
+
+ /*
+ * temporarily null-terminate the path at the end of
+ * the current component
+ */
+ tmp = *s;
+ *s = 0;
+ rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
+ full_path);
+ *s = tmp;
+ }
+ return rc;
+}
+
int
cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
{
@@ -3630,7 +3619,7 @@ try_mount_again:
cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
/* tune readahead according to rsize */
- cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
+ cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -3671,6 +3660,16 @@ remote_path_check:
kfree(full_path);
goto mount_fail_check;
}
+
+ rc = cifs_are_all_path_components_accessible(server,
+ xid, tcon, cifs_sb,
+ full_path);
+ if (rc != 0) {
+ cifs_dbg(VFS, "cannot query dirs between root and final path, "
+ "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ rc = 0;
+ }
kfree(full_path);
}
@@ -3940,6 +3939,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
+ kfree(cifs_sb->prepath);
call_rcu(&cifs_sb->rcu, delayed_free);
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index c3eb998a99bd1..4716c54dbfc64 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -84,6 +84,7 @@ build_path_from_dentry(struct dentry *direntry)
struct dentry *temp;
int namelen;
int dfsplen;
+ int pplen = 0;
char *full_path;
char dirsep;
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
@@ -95,8 +96,12 @@ build_path_from_dentry(struct dentry *direntry)
dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+ pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0;
+
cifs_bp_rename_retry:
- namelen = dfsplen;
+ namelen = dfsplen + pplen;
seq = read_seqbegin(&rename_lock);
rcu_read_lock();
for (temp = direntry; !IS_ROOT(temp);) {
@@ -137,7 +142,7 @@ cifs_bp_rename_retry:
}
}
rcu_read_unlock();
- if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
+ if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) {
cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
namelen, dfsplen);
/* presumably this is only possible if racing with a rename
@@ -153,6 +158,17 @@ cifs_bp_rename_retry:
those safely to '/' if any are found in the middle of the prepath */
/* BB test paths to Windows with '/' in the midst of prepath */
+ if (pplen) {
+ int i;
+
+ cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath);
+ memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1);
+ full_path[dfsplen] = '\\';
+ for (i = 0; i < pplen-1; i++)
+ if (full_path[dfsplen+1+i] == '/')
+ full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb);
+ }
+
if (dfsplen) {
strncpy(full_path, tcon->treeName, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
@@ -229,6 +245,13 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
goto cifs_create_get_file_info;
}
+ if (S_ISDIR(newinode->i_mode)) {
+ CIFSSMBClose(xid, tcon, fid->netfid);
+ iput(newinode);
+ rc = -EISDIR;
+ goto out;
+ }
+
if (!S_ISREG(newinode->i_mode)) {
/*
* The server may allow us to open things like
@@ -399,10 +422,14 @@ cifs_create_set_dentry:
if (rc != 0) {
cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
rc);
- if (server->ops->close)
- server->ops->close(xid, tcon, fid);
- goto out;
+ goto out_err;
+ }
+
+ if (S_ISDIR(newinode->i_mode)) {
+ rc = -EISDIR;
+ goto out_err;
}
+
d_drop(direntry);
d_add(direntry, newinode);
@@ -410,6 +437,13 @@ out:
kfree(buf);
kfree(full_path);
return rc;
+
+out_err:
+ if (server->ops->close)
+ server->ops->close(xid, tcon, fid);
+ if (newinode)
+ iput(newinode);
+ goto out;
}
int
@@ -445,7 +479,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
* Check for hashed negative dentry. We have already revalidated
* the dentry and it is fine. No need to perform another lookup.
*/
- if (!d_unhashed(direntry))
+ if (!d_in_lookup(direntry))
return -ENOENT;
res = cifs_lookup(inode, direntry, 0);
@@ -856,7 +890,7 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
wchar_t c;
int i, charlen;
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (i = 0; i < q->len; i += charlen) {
charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
/* error out if we can't convert the character */
@@ -869,10 +903,10 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
return 0;
}
-static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
+static int cifs_ci_compare(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
+ struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
wchar_t c1, c2;
int i, l1, l2;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ff882aeaccc67..579e41b350a2d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -271,7 +271,7 @@ struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
{
- struct dentry *dentry = file->f_path.dentry;
+ struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifsFileInfo *cfile;
@@ -461,7 +461,7 @@ int cifs_open(struct inode *inode, struct file *file)
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
- full_path = build_path_from_dentry(file->f_path.dentry);
+ full_path = build_path_from_dentry(file_dentry(file));
if (full_path == NULL) {
rc = -ENOMEM;
goto out;
@@ -1112,6 +1112,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
return rc;
}
+static __u32
+hash_lockowner(fl_owner_t owner)
+{
+ return cifs_lock_secret ^ hash32_ptr((const void *)owner);
+}
+
struct lock_to_push {
struct list_head llist;
__u64 offset;
@@ -1178,7 +1184,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
else
type = CIFS_WRLCK;
lck = list_entry(el, struct lock_to_push, llist);
- lck->pid = flock->fl_pid;
+ lck->pid = hash_lockowner(flock->fl_owner);
lck->netfid = cfile->fid.netfid;
lck->length = length;
lck->type = type;
@@ -1305,7 +1311,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
posix_lock_type = CIFS_RDLCK;
else
posix_lock_type = CIFS_WRLCK;
- rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
+ rc = CIFSSMBPosixLock(xid, tcon, netfid,
+ hash_lockowner(flock->fl_owner),
flock->fl_start, length, flock,
posix_lock_type, wait_flag);
return rc;
@@ -1505,7 +1512,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
posix_lock_type = CIFS_UNLCK;
rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid,
- current->tgid, flock->fl_start, length,
+ hash_lockowner(flock->fl_owner),
+ flock->fl_start, length,
NULL, posix_lock_type, wait_flag);
goto out;
}
@@ -1833,7 +1841,7 @@ refind_writable:
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
{
struct address_space *mapping = page->mapping;
- loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ loff_t offset = (loff_t)page->index << PAGE_SHIFT;
char *write_data;
int rc = -EFAULT;
int bytes_written = 0;
@@ -1849,7 +1857,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data = kmap(page);
write_data += from;
- if ((to > PAGE_CACHE_SIZE) || (from > to)) {
+ if ((to > PAGE_SIZE) || (from > to)) {
kunmap(page);
return -EIO;
}
@@ -1902,7 +1910,7 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
* find_get_pages_tag seems to return a max of 256 on each
* iteration, so we must call it several times in order to
* fill the array or the wsize is effectively limited to
- * 256 * PAGE_CACHE_SIZE.
+ * 256 * PAGE_SIZE.
*/
*found_pages = 0;
pages = wdata->pages;
@@ -1991,7 +1999,7 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
/* put any pages we aren't going to use */
for (i = nr_pages; i < found_pages; i++) {
- page_cache_release(wdata->pages[i]);
+ put_page(wdata->pages[i]);
wdata->pages[i] = NULL;
}
@@ -2009,11 +2017,11 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
wdata->sync_mode = wbc->sync_mode;
wdata->nr_pages = nr_pages;
wdata->offset = page_offset(wdata->pages[0]);
- wdata->pagesz = PAGE_CACHE_SIZE;
+ wdata->pagesz = PAGE_SIZE;
wdata->tailsz = min(i_size_read(mapping->host) -
page_offset(wdata->pages[nr_pages - 1]),
- (loff_t)PAGE_CACHE_SIZE);
- wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
+ (loff_t)PAGE_SIZE);
+ wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
if (wdata->cfile != NULL)
cifsFileInfo_put(wdata->cfile);
@@ -2047,15 +2055,15 @@ static int cifs_writepages(struct address_space *mapping,
* If wsize is smaller than the page cache size, default to writing
* one page at a time via cifs_writepage
*/
- if (cifs_sb->wsize < PAGE_CACHE_SIZE)
+ if (cifs_sb->wsize < PAGE_SIZE)
return generic_writepages(mapping, wbc);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = true;
scanned = true;
@@ -2071,7 +2079,7 @@ retry:
if (rc)
break;
- tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
+ tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
&found_pages);
@@ -2111,7 +2119,7 @@ retry:
else
SetPageError(wdata->pages[i]);
end_page_writeback(wdata->pages[i]);
- page_cache_release(wdata->pages[i]);
+ put_page(wdata->pages[i]);
}
if (rc != -EAGAIN)
mapping_set_error(mapping, rc);
@@ -2154,7 +2162,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
xid = get_xid();
/* BB add check for wbc flags */
- page_cache_get(page);
+ get_page(page);
if (!PageUptodate(page))
cifs_dbg(FYI, "ppw - page not up to date\n");
@@ -2170,7 +2178,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
*/
set_page_writeback(page);
retry_write:
- rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
+ rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
goto retry_write;
else if (rc == -EAGAIN)
@@ -2180,7 +2188,7 @@ retry_write:
else
SetPageUptodate(page);
end_page_writeback(page);
- page_cache_release(page);
+ put_page(page);
free_xid(xid);
return rc;
}
@@ -2214,12 +2222,12 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
if (copied == len)
SetPageUptodate(page);
ClearPageChecked(page);
- } else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+ } else if (!PageUptodate(page) && copied == PAGE_SIZE)
SetPageUptodate(page);
if (!PageUptodate(page)) {
char *page_data;
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = pos & (PAGE_SIZE - 1);
unsigned int xid;
xid = get_xid();
@@ -2248,7 +2256,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return rc;
}
@@ -2687,11 +2695,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
out:
inode_unlock(inode);
- if (rc > 0) {
- ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
- if (err < 0)
- rc = err;
- }
+ if (rc > 0)
+ rc = generic_write_sync(iocb, rc);
up_read(&cinode->lock_sem);
return rc;
}
@@ -2855,39 +2860,31 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
int result = 0;
unsigned int i;
unsigned int nr_pages = rdata->nr_pages;
- struct kvec iov;
rdata->got_bytes = 0;
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
+ size_t n;
- if (len >= PAGE_SIZE) {
- /* enough data to fill the page */
- iov.iov_base = kmap(page);
- iov.iov_len = PAGE_SIZE;
- cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
- i, iov.iov_base, iov.iov_len);
- len -= PAGE_SIZE;
- } else if (len > 0) {
- /* enough for partial page, fill and zero the rest */
- iov.iov_base = kmap(page);
- iov.iov_len = len;
- cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
- i, iov.iov_base, iov.iov_len);
- memset(iov.iov_base + len, '\0', PAGE_SIZE - len);
- rdata->tailsz = len;
- len = 0;
- } else {
+ if (len <= 0) {
/* no need to hold page hostage */
rdata->pages[i] = NULL;
rdata->nr_pages--;
put_page(page);
continue;
}
-
- result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
- kunmap(page);
+ n = len;
+ if (len >= PAGE_SIZE) {
+ /* enough data to fill the page */
+ n = PAGE_SIZE;
+ len -= n;
+ } else {
+ zero_user(page, len, PAGE_SIZE - len);
+ rdata->tailsz = len;
+ len = 0;
+ }
+ result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
break;
@@ -3286,9 +3283,9 @@ cifs_readv_complete(struct work_struct *work)
(rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
- got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
+ got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);
- page_cache_release(page);
+ put_page(page);
rdata->pages[i] = NULL;
}
kref_put(&rdata->refcount, cifs_readdata_release);
@@ -3303,34 +3300,24 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
u64 eof;
pgoff_t eof_index;
unsigned int nr_pages = rdata->nr_pages;
- struct kvec iov;
/* determine the eof that the server (probably) has */
eof = CIFS_I(rdata->mapping->host)->server_eof;
- eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+ eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0;
cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
rdata->got_bytes = 0;
- rdata->tailsz = PAGE_CACHE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
+ size_t n = PAGE_SIZE;
- if (len >= PAGE_CACHE_SIZE) {
- /* enough data to fill the page */
- iov.iov_base = kmap(page);
- iov.iov_len = PAGE_CACHE_SIZE;
- cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
- i, page->index, iov.iov_base, iov.iov_len);
- len -= PAGE_CACHE_SIZE;
+ if (len >= PAGE_SIZE) {
+ len -= PAGE_SIZE;
} else if (len > 0) {
/* enough for partial page, fill and zero the rest */
- iov.iov_base = kmap(page);
- iov.iov_len = len;
- cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
- i, page->index, iov.iov_base, iov.iov_len);
- memset(iov.iov_base + len,
- '\0', PAGE_CACHE_SIZE - len);
- rdata->tailsz = len;
+ zero_user(page, len, PAGE_SIZE - len);
+ n = rdata->tailsz = len;
len = 0;
} else if (page->index > eof_index) {
/*
@@ -3341,12 +3328,12 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
* to prevent the VFS from repeatedly attempting to
* fill them until the writes are flushed.
*/
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
lru_cache_add_file(page);
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
rdata->pages[i] = NULL;
rdata->nr_pages--;
continue;
@@ -3354,14 +3341,13 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
/* no need to hold page hostage */
lru_cache_add_file(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
rdata->pages[i] = NULL;
rdata->nr_pages--;
continue;
}
- result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
- kunmap(page);
+ result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
break;
@@ -3380,7 +3366,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
struct page *page, *tpage;
unsigned int expected_index;
int rc;
- gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+ gfp_t gfp = readahead_gfp_mask(mapping);
INIT_LIST_HEAD(tmplist);
@@ -3402,8 +3388,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
}
/* move first page to the tmplist */
- *offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
- *bytes = PAGE_CACHE_SIZE;
+ *offset = (loff_t)page->index << PAGE_SHIFT;
+ *bytes = PAGE_SIZE;
*nr_pages = 1;
list_move_tail(&page->lru, tmplist);
@@ -3415,7 +3401,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
break;
/* would this page push the read over the rsize? */
- if (*bytes + PAGE_CACHE_SIZE > rsize)
+ if (*bytes + PAGE_SIZE > rsize)
break;
__SetPageLocked(page);
@@ -3424,7 +3410,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
break;
}
list_move_tail(&page->lru, tmplist);
- (*bytes) += PAGE_CACHE_SIZE;
+ (*bytes) += PAGE_SIZE;
expected_index++;
(*nr_pages)++;
}
@@ -3493,7 +3479,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* reach this point however since we set ra_pages to 0 when the
* rsize is smaller than a cache page.
*/
- if (unlikely(rsize < PAGE_CACHE_SIZE)) {
+ if (unlikely(rsize < PAGE_SIZE)) {
add_credits_and_wake_if(server, credits, 0);
return 0;
}
@@ -3512,7 +3498,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
list_del(&page->lru);
lru_cache_add_file(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
rc = -ENOMEM;
add_credits_and_wake_if(server, credits, 0);
@@ -3524,7 +3510,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->offset = offset;
rdata->bytes = bytes;
rdata->pid = pid;
- rdata->pagesz = PAGE_CACHE_SIZE;
+ rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->credits = credits;
@@ -3542,7 +3528,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
page = rdata->pages[i];
lru_cache_add_file(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
/* Fallback to the readpage in error/reconnect cases */
kref_put(&rdata->refcount, cifs_readdata_release);
@@ -3577,7 +3563,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
read_data = kmap(page);
/* for reads over a certain size could initiate async read ahead */
- rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset);
+ rc = cifs_read(file, read_data, PAGE_SIZE, poffset);
if (rc < 0)
goto io_error;
@@ -3587,8 +3573,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
file_inode(file)->i_atime =
current_fs_time(file_inode(file)->i_sb);
- if (PAGE_CACHE_SIZE > rc)
- memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc);
+ if (PAGE_SIZE > rc)
+ memset(read_data + rc, 0, PAGE_SIZE - rc);
flush_dcache_page(page);
SetPageUptodate(page);
@@ -3608,7 +3594,7 @@ read_complete:
static int cifs_readpage(struct file *file, struct page *page)
{
- loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ loff_t offset = (loff_t)page->index << PAGE_SHIFT;
int rc = -EACCES;
unsigned int xid;
@@ -3679,8 +3665,8 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
int oncethru = 0;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = pos >> PAGE_SHIFT;
+ loff_t offset = pos & (PAGE_SIZE - 1);
loff_t page_start = pos & PAGE_MASK;
loff_t i_size;
struct page *page;
@@ -3703,7 +3689,7 @@ start:
* the server. If the write is short, we'll end up doing a sync write
* instead.
*/
- if (len == PAGE_CACHE_SIZE)
+ if (len == PAGE_SIZE)
goto out;
/*
@@ -3718,7 +3704,7 @@ start:
(offset == 0 && (pos + len) >= i_size)) {
zero_user_segments(page, 0, offset,
offset + len,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
/*
* PageChecked means that the parts of the page
* to which we're not writing are considered up
@@ -3737,7 +3723,7 @@ start:
* do a sync write instead since PG_uptodate isn't set.
*/
cifs_readpage_worker(file, page, &page_start);
- page_cache_release(page);
+ put_page(page);
oncethru = 1;
goto start;
} else {
@@ -3764,7 +3750,7 @@ static void cifs_invalidate_page(struct page *page, unsigned int offset,
{
struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
- if (offset == 0 && length == PAGE_CACHE_SIZE)
+ if (offset == 0 && length == PAGE_SIZE)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
@@ -3772,7 +3758,7 @@ static int cifs_launder_page(struct page *page)
{
int rc = 0;
loff_t range_start = page_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+ loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
@@ -3854,7 +3840,7 @@ void cifs_oplock_break(struct work_struct *work)
* Direct IO is not yet supported in the cached mode.
*/
static ssize_t
-cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
{
/*
* FIXME
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aeb26dbfa1bf2..b87efd0c92d60 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -59,7 +59,7 @@ static void cifs_set_ops(struct inode *inode)
/* check if server can support readpages */
if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
- PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
+ PAGE_SIZE + MAX_CIFS_HDR_SIZE)
inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
else
inode->i_data.a_ops = &cifs_addr_ops;
@@ -1002,10 +1002,26 @@ struct inode *cifs_root_iget(struct super_block *sb)
struct inode *inode = NULL;
long rc;
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ char *path = NULL;
+ int len;
+
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+ && cifs_sb->prepath) {
+ len = strlen(cifs_sb->prepath);
+ path = kzalloc(len + 2 /* leading sep + null */, GFP_KERNEL);
+ if (path == NULL)
+ return ERR_PTR(-ENOMEM);
+ path[0] = '/';
+ memcpy(path+1, cifs_sb->prepath, len);
+ } else {
+ path = kstrdup("", GFP_KERNEL);
+ if (path == NULL)
+ return ERR_PTR(-ENOMEM);
+ }
xid = get_xid();
if (tcon->unix_ext) {
- rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
+ rc = cifs_get_inode_info_unix(&inode, path, sb, xid);
/* some servers mistakenly claim POSIX support */
if (rc != -EOPNOTSUPP)
goto iget_no_retry;
@@ -1013,7 +1029,8 @@ struct inode *cifs_root_iget(struct super_block *sb)
tcon->unix_ext = false;
}
- rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
+ convert_delimiter(path, CIFS_DIR_SEP(cifs_sb));
+ rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL);
iget_no_retry:
if (!inode) {
@@ -1042,6 +1059,7 @@ iget_no_retry:
}
out:
+ kfree(path);
/* can not call macro free_xid here since in a void func
* TODO: This is no longer true
*/
@@ -2019,8 +2037,8 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
static int cifs_truncate_page(struct address_space *mapping, loff_t from)
{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE - 1);
struct page *page;
int rc = 0;
@@ -2028,9 +2046,9 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
if (!page)
return -ENOMEM;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return rc;
}
@@ -2418,8 +2436,7 @@ cifs_setattr_exit:
int
cifs_setattr(struct dentry *direntry, struct iattr *attrs)
{
- struct inode *inode = d_inode(direntry);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
if (pTcon->unix_ext)
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 848249fa120fc..3079b38f0afbd 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -133,6 +133,6 @@ typedef struct _AUTHENTICATE_MESSAGE {
int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses);
void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses);
-int build_ntlmssp_auth_blob(unsigned char *pbuffer, u16 *buflen,
+int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen,
struct cifs_ses *ses,
const struct nls_table *nls_cp);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b30a4a6d98a0f..65cf85dcda09b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -78,20 +78,34 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
{
struct dentry *dentry, *alias;
struct inode *inode;
- struct super_block *sb = d_inode(parent)->i_sb;
+ struct super_block *sb = parent->d_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
dentry = d_hash_and_lookup(parent, name);
+ if (!dentry) {
+ /*
+ * If we know that the inode will need to be revalidated
+ * immediately, then don't create a new dentry for it.
+ * We'll end up doing an on the wire call either way and
+ * this spares us an invalidation.
+ */
+ if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+ return;
+retry:
+ dentry = d_alloc_parallel(parent, name, &wq);
+ }
if (IS_ERR(dentry))
return;
-
- if (dentry) {
+ if (!d_in_lookup(dentry)) {
inode = d_inode(dentry);
if (inode) {
- if (d_mountpoint(dentry))
- goto out;
+ if (d_mountpoint(dentry)) {
+ dput(dentry);
+ return;
+ }
/*
* If we're generating inode numbers, then we don't
* want to clobber the existing one with the one that
@@ -106,33 +120,22 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
(inode->i_mode & S_IFMT) ==
(fattr->cf_mode & S_IFMT)) {
cifs_fattr_to_inode(inode, fattr);
- goto out;
+ dput(dentry);
+ return;
}
}
d_invalidate(dentry);
dput(dentry);
+ goto retry;
+ } else {
+ inode = cifs_iget(sb, fattr);
+ if (!inode)
+ inode = ERR_PTR(-ENOMEM);
+ alias = d_splice_alias(inode, dentry);
+ d_lookup_done(dentry);
+ if (alias && !IS_ERR(alias))
+ dput(alias);
}
-
- /*
- * If we know that the inode will need to be revalidated immediately,
- * then don't create a new dentry for it. We'll end up doing an on
- * the wire call either way and this spares us an invalidation.
- */
- if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
- return;
-
- dentry = d_alloc(parent, name);
- if (!dentry)
- return;
-
- inode = cifs_iget(sb, fattr);
- if (!inode)
- goto out;
-
- alias = d_splice_alias(inode, dentry);
- if (alias && !IS_ERR(alias))
- dput(alias);
-out:
dput(dentry);
}
@@ -300,7 +303,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file)
cifsFile->invalidHandle = true;
cifsFile->srch_inf.endOfSearch = false;
- full_path = build_path_from_dentry(file->f_path.dentry);
+ full_path = build_path_from_dentry(file_dentry(file));
if (full_path == NULL) {
rc = -ENOMEM;
goto error_exit;
@@ -759,7 +762,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
*/
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
- cifs_prime_dcache(file->f_path.dentry, &name, &fattr);
+ cifs_prime_dcache(file_dentry(file), &name, &fattr);
ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 59727e32ed0f6..538d9b55699a1 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -364,19 +364,43 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
sec_blob->DomainName.MaximumLength = 0;
}
-/* We do not malloc the blob, it is passed in pbuffer, because its
- maximum possible size is fixed and small, making this approach cleaner.
- This function returns the length of the data in the blob */
-int build_ntlmssp_auth_blob(unsigned char *pbuffer,
+static int size_of_ntlmssp_blob(struct cifs_ses *ses)
+{
+ int sz = sizeof(AUTHENTICATE_MESSAGE) + ses->auth_key.len
+ - CIFS_SESS_KEY_SIZE + CIFS_CPHTXT_SIZE + 2;
+
+ if (ses->domainName)
+ sz += 2 * strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ else
+ sz += 2;
+
+ if (ses->user_name)
+ sz += 2 * strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
+ else
+ sz += 2;
+
+ return sz;
+}
+
+int build_ntlmssp_auth_blob(unsigned char **pbuffer,
u16 *buflen,
struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
int rc;
- AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
+ AUTHENTICATE_MESSAGE *sec_blob;
__u32 flags;
unsigned char *tmp;
+ rc = setup_ntlmv2_rsp(ses, nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
+ *buflen = 0;
+ goto setup_ntlmv2_ret;
+ }
+ *pbuffer = kmalloc(size_of_ntlmssp_blob(ses), GFP_KERNEL);
+ sec_blob = (AUTHENTICATE_MESSAGE *)*pbuffer;
+
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmAuthenticate;
@@ -391,7 +415,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
}
- tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
+ tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
sec_blob->LmChallengeResponse.BufferOffset =
@@ -399,39 +423,43 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
sec_blob->LmChallengeResponse.Length = 0;
sec_blob->LmChallengeResponse.MaximumLength = 0;
- sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
- rc = setup_ntlmv2_rsp(ses, nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
- goto setup_ntlmv2_ret;
+ sec_blob->NtChallengeResponse.BufferOffset =
+ cpu_to_le32(tmp - *pbuffer);
+ if (ses->user_name != NULL) {
+ memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
+ sec_blob->NtChallengeResponse.Length =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ sec_blob->NtChallengeResponse.MaximumLength =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ } else {
+ /*
+ * don't send an NT Response for anonymous access
+ */
+ sec_blob->NtChallengeResponse.Length = 0;
+ sec_blob->NtChallengeResponse.MaximumLength = 0;
}
- memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
-
- sec_blob->NtChallengeResponse.Length =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- sec_blob->NtChallengeResponse.MaximumLength =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
if (ses->domainName == NULL) {
- sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->DomainName.Length = 0;
sec_blob->DomainName.MaximumLength = 0;
tmp += 2;
} else {
int len;
len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
- CIFS_MAX_USERNAME_LEN, nls_cp);
+ CIFS_MAX_DOMAINNAME_LEN, nls_cp);
len *= 2; /* unicode is 2 bytes each */
- sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->DomainName.Length = cpu_to_le16(len);
sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
tmp += len;
}
if (ses->user_name == NULL) {
- sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->UserName.Length = 0;
sec_blob->UserName.MaximumLength = 0;
tmp += 2;
@@ -440,13 +468,13 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
CIFS_MAX_USERNAME_LEN, nls_cp);
len *= 2; /* unicode is 2 bytes each */
- sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->UserName.Length = cpu_to_le16(len);
sec_blob->UserName.MaximumLength = cpu_to_le16(len);
tmp += len;
}
- sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->WorkstationName.Length = 0;
sec_blob->WorkstationName.MaximumLength = 0;
tmp += 2;
@@ -455,19 +483,19 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
(ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
&& !calc_seckey(ses)) {
memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
- sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
sec_blob->SessionKey.MaximumLength =
cpu_to_le16(CIFS_CPHTXT_SIZE);
tmp += CIFS_CPHTXT_SIZE;
} else {
- sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->SessionKey.Length = 0;
sec_blob->SessionKey.MaximumLength = 0;
}
+ *buflen = tmp - *pbuffer;
setup_ntlmv2_ret:
- *buflen = tmp - pbuffer;
return rc;
}
@@ -670,20 +698,26 @@ sess_auth_lanman(struct sess_data *sess_data)
pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
- /* no capabilities flags in old lanman negotiation */
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-
- /* Calculate hash with password and copy into bcc_ptr.
- * Encryption Key (stored as in cryptkey) gets used if the
- * security mode bit in Negottiate Protocol response states
- * to use challenge/response method (i.e. Password bit is 1).
- */
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
- ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
- true : false, lnm_session_key);
-
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ if (ses->user_name != NULL) {
+ /* no capabilities flags in old lanman negotiation */
+ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+
+ /* Calculate hash with password and copy into bcc_ptr.
+ * Encryption Key (stored as in cryptkey) gets used if the
+ * security mode bit in Negottiate Protocol response states
+ * to use challenge/response method (i.e. Password bit is 1).
+ */
+ rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
+ ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
+ true : false, lnm_session_key);
+ if (rc)
+ goto out;
+
+ memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ } else {
+ pSMB->old_req.PasswordLength = 0;
+ }
/*
* can not sign if LANMAN negotiated so no need
@@ -769,27 +803,32 @@ sess_auth_ntlm(struct sess_data *sess_data)
capabilities = cifs_ssetup_hdr(ses, pSMB);
pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
- pSMB->req_no_secext.CaseInsensitivePasswordLength =
- cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- pSMB->req_no_secext.CaseSensitivePasswordLength =
- cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+ if (ses->user_name != NULL) {
+ pSMB->req_no_secext.CaseInsensitivePasswordLength =
+ cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
+ cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+
+ /* calculate ntlm response and session key */
+ rc = setup_ntlm_response(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+ rc);
+ goto out;
+ }
- /* calculate ntlm response and session key */
- rc = setup_ntlm_response(ses, sess_data->nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLM authentication\n",
- rc);
- goto out;
+ /* copy ntlm response */
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ } else {
+ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
+ pSMB->req_no_secext.CaseSensitivePasswordLength = 0;
}
- /* copy ntlm response */
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
-
if (ses->capabilities & CAP_UNICODE) {
/* unicode strings must be word aligned */
if (sess_data->iov[0].iov_len % 2) {
@@ -878,22 +917,26 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
/* LM2 password would be here if we supported it */
pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
- /* calculate nlmv2 response and session key */
- rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
- goto out;
- }
+ if (ses->user_name != NULL) {
+ /* calculate nlmv2 response and session key */
+ rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
+ goto out;
+ }
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
- /* set case sensitive password length after tilen may get
- * assigned, tilen is 0 otherwise.
- */
- pSMB->req_no_secext.CaseSensitivePasswordLength =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ /* set case sensitive password length after tilen may get
+ * assigned, tilen is 0 otherwise.
+ */
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ } else {
+ pSMB->req_no_secext.CaseSensitivePasswordLength = 0;
+ }
if (ses->capabilities & CAP_UNICODE) {
if (sess_data->iov[0].iov_len % 2) {
@@ -1245,7 +1288,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
struct cifs_ses *ses = sess_data->ses;
__u16 bytes_remaining;
char *bcc_ptr;
- char *ntlmsspblob = NULL;
+ unsigned char *ntlmsspblob = NULL;
u16 blob_len;
cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n");
@@ -1258,19 +1301,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
/* Build security blob before we assemble the request */
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
smb_buf = (struct smb_hdr *)pSMB;
- /*
- * 5 is an empirical value, large enough to hold
- * authenticate message plus max 10 of av paris,
- * domain, user, workstation names, flags, etc.
- */
- ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE),
- GFP_KERNEL);
- if (!ntlmsspblob) {
- rc = -ENOMEM;
- goto out;
- }
-
- rc = build_ntlmssp_auth_blob(ntlmsspblob,
+ rc = build_ntlmssp_auth_blob(&ntlmsspblob,
&blob_len, ses, sess_data->nls_cp);
if (rc)
goto out_free_ntlmsspblob;
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index bc0bb9c34f72a..0ffa180943357 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -44,6 +44,7 @@
#define SMB2_OP_DELETE 7
#define SMB2_OP_HARDLINK 8
#define SMB2_OP_SET_EOF 9
+#define SMB2_OP_RMDIR 10
/* Used when constructing chained read requests. */
#define CHAINED_REQUEST 1
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 899bbc86f73e1..4f0231e685a92 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -80,6 +80,10 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
* SMB2_open() call.
*/
break;
+ case SMB2_OP_RMDIR:
+ tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid);
+ break;
case SMB2_OP_RENAME:
tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
fid.volatile_fid, (__le16 *)data);
@@ -191,8 +195,8 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
- NULL, SMB2_OP_DELETE);
+ CREATE_NOT_FILE,
+ NULL, SMB2_OP_RMDIR);
}
int
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 3525ed756173d..d203c0329626c 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1044,6 +1044,9 @@ smb2_new_lease_key(struct cifs_fid *fid)
get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
}
+#define SMB2_SYMLINK_STRUCT_SIZE \
+ (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
+
static int
smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, char **target_path,
@@ -1056,7 +1059,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct smb2_err_rsp *err_buf = NULL;
struct smb2_symlink_err_rsp *symlink;
- unsigned int sub_len, sub_offset;
+ unsigned int sub_len;
+ unsigned int sub_offset;
+ unsigned int print_len;
+ unsigned int print_offset;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -1077,11 +1083,33 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
kfree(utf16_path);
return -ENOENT;
}
+
+ if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
+ get_rfc1002_length(err_buf) + 4 < SMB2_SYMLINK_STRUCT_SIZE) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+
/* open must fail on symlink - reset rc */
rc = 0;
symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
sub_len = le16_to_cpu(symlink->SubstituteNameLength);
sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
+ print_len = le16_to_cpu(symlink->PrintNameLength);
+ print_offset = le16_to_cpu(symlink->PrintNameOffset);
+
+ if (get_rfc1002_length(err_buf) + 4 <
+ SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+
+ if (get_rfc1002_length(err_buf) + 4 <
+ SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+
*target_path = cifs_strndup_from_utf16(
(char *)symlink->PathBuffer + sub_offset,
sub_len, true, cifs_sb->local_nls);
@@ -1515,6 +1543,8 @@ struct smb_version_operations smb20_operations = {
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
.query_symlink = smb2_query_symlink,
+ .query_mf_symlink = smb3_query_mf_symlink,
+ .create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 42e1f440eb1e9..29e06db5f187b 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -588,7 +588,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
u16 blob_length = 0;
struct key *spnego_key = NULL;
char *security_blob = NULL;
- char *ntlmssp_blob = NULL;
+ unsigned char *ntlmssp_blob = NULL;
bool use_spnego = false; /* else use raw ntlmssp */
cifs_dbg(FYI, "Session Setup\n");
@@ -713,13 +713,7 @@ ssetup_ntlmssp_authenticate:
iov[1].iov_len = blob_length;
} else if (phase == NtLmAuthenticate) {
req->hdr.SessionId = ses->Suid;
- ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500,
- GFP_KERNEL);
- if (ntlmssp_blob == NULL) {
- rc = -ENOMEM;
- goto ssetup_exit;
- }
- rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses,
+ rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
nls_cp);
if (rc) {
cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n",
@@ -1818,6 +1812,33 @@ SMB2_echo(struct TCP_Server_Info *server)
cifs_dbg(FYI, "In echo request\n");
+ if (server->tcpStatus == CifsNeedNegotiate) {
+ struct list_head *tmp, *tmp2;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n");
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(tmp, &server->smb_ses_list) {
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ list_for_each(tmp2, &ses->tcon_list) {
+ tcon = list_entry(tmp2, struct cifs_tcon,
+ tcon_list);
+ /* add check for persistent handle reconnect */
+ if (tcon && tcon->need_reconnect) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ rc = smb2_reconnect(SMB2_ECHO, tcon);
+ spin_lock(&cifs_tcp_ses_lock);
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ }
+
+ /* if no session, renegotiate failed above */
+ if (server->tcpStatus == CifsNeedNegotiate)
+ return -EIO;
+
rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req);
if (rc)
return rc;
@@ -2575,6 +2596,22 @@ SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
}
int
+SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid)
+{
+ __u8 delete_pending = 1;
+ void *data;
+ unsigned int size;
+
+ data = &delete_pending;
+ size = 1; /* sizeof __u8 */
+
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ current->tgid, FILE_DISPOSITION_INFORMATION, 1, &data,
+ &size);
+}
+
+int
SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
{
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 4f07dc93608db..eb2cde2f64ba7 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -141,6 +141,8 @@ extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le16 *target_file);
+extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid);
extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le16 *target_file);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 8732a43b10084..bc9a7b6346434 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -135,11 +135,10 @@ smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server)
int
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
- int i, rc;
+ int rc;
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
unsigned char *sigptr = smb2_signature;
struct kvec *iov = rqst->rq_iov;
- int n_vec = rqst->rq_nvec;
struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
@@ -171,53 +170,11 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return rc;
}
- for (i = 0; i < n_vec; i++) {
- if (iov[i].iov_len == 0)
- continue;
- if (iov[i].iov_base == NULL) {
- cifs_dbg(VFS, "null iovec entry\n");
- return -EIO;
- }
- /*
- * The first entry includes a length field (which does not get
- * signed that occupies the first 4 bytes before the header).
- */
- if (i == 0) {
- if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
- break; /* nothing to sign or corrupt header */
- rc =
- crypto_shash_update(
- &server->secmech.sdeschmacsha256->shash,
- iov[i].iov_base + 4, iov[i].iov_len - 4);
- } else {
- rc =
- crypto_shash_update(
- &server->secmech.sdeschmacsha256->shash,
- iov[i].iov_base, iov[i].iov_len);
- }
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with payload\n",
- __func__);
- return rc;
- }
- }
-
- /* now hash over the rq_pages array */
- for (i = 0; i < rqst->rq_npages; i++) {
- struct kvec p_iov;
-
- cifs_rqst_page_to_kvec(rqst, i, &p_iov);
- crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- p_iov.iov_base, p_iov.iov_len);
- kunmap(rqst->rq_pages[i]);
- }
-
- rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
- sigptr);
- if (rc)
- cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+ rc = __cifs_calc_signature(rqst, server, sigptr,
+ &server->secmech.sdeschmacsha256->shash);
- memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+ if (!rc)
+ memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
return rc;
}
@@ -395,12 +352,10 @@ generate_smb311signingkey(struct cifs_ses *ses)
int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
- int i;
int rc = 0;
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
- int n_vec = rqst->rq_nvec;
struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
@@ -431,54 +386,12 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
return rc;
}
+
+ rc = __cifs_calc_signature(rqst, server, sigptr,
+ &server->secmech.sdesccmacaes->shash);
- for (i = 0; i < n_vec; i++) {
- if (iov[i].iov_len == 0)
- continue;
- if (iov[i].iov_base == NULL) {
- cifs_dbg(VFS, "null iovec entry");
- return -EIO;
- }
- /*
- * The first entry includes a length field (which does not get
- * signed that occupies the first 4 bytes before the header).
- */
- if (i == 0) {
- if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
- break; /* nothing to sign or corrupt header */
- rc =
- crypto_shash_update(
- &server->secmech.sdesccmacaes->shash,
- iov[i].iov_base + 4, iov[i].iov_len - 4);
- } else {
- rc =
- crypto_shash_update(
- &server->secmech.sdesccmacaes->shash,
- iov[i].iov_base, iov[i].iov_len);
- }
- if (rc) {
- cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
- __func__);
- return rc;
- }
- }
-
- /* now hash over the rq_pages array */
- for (i = 0; i < rqst->rq_npages; i++) {
- struct kvec p_iov;
-
- cifs_rqst_page_to_kvec(rqst, i, &p_iov);
- crypto_shash_update(&server->secmech.sdesccmacaes->shash,
- p_iov.iov_base, p_iov.iov_len);
- kunmap(rqst->rq_pages[i]);
- }
-
- rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
- sigptr);
- if (rc)
- cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
-
- memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+ if (!rc)
+ memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
return rc;
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 87abe8ed074c3..206a597b2293c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -124,41 +124,32 @@ cifs_delete_mid(struct mid_q_entry *mid)
/*
* smb_send_kvec - send an array of kvecs to the server
* @server: Server to send the data to
- * @iov: Pointer to array of kvecs
- * @n_vec: length of kvec array
+ * @smb_msg: Message to send
* @sent: amount of data sent on socket is stored here
*
* Our basic "send data to server" function. Should be called with srv_mutex
* held. The caller is responsible for handling the results.
*/
static int
-smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
- size_t *sent)
+smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
+ size_t *sent)
{
int rc = 0;
- int i = 0;
- struct msghdr smb_msg;
- unsigned int remaining;
- size_t first_vec = 0;
+ int retries = 0;
struct socket *ssocket = server->ssocket;
*sent = 0;
- smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
- smb_msg.msg_namelen = sizeof(struct sockaddr);
- smb_msg.msg_control = NULL;
- smb_msg.msg_controllen = 0;
+ smb_msg->msg_name = (struct sockaddr *) &server->dstaddr;
+ smb_msg->msg_namelen = sizeof(struct sockaddr);
+ smb_msg->msg_control = NULL;
+ smb_msg->msg_controllen = 0;
if (server->noblocksnd)
- smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+ smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
else
- smb_msg.msg_flags = MSG_NOSIGNAL;
-
- remaining = 0;
- for (i = 0; i < n_vec; i++)
- remaining += iov[i].iov_len;
+ smb_msg->msg_flags = MSG_NOSIGNAL;
- i = 0;
- while (remaining) {
+ while (msg_data_left(smb_msg)) {
/*
* If blocking send, we try 3 times, since each can block
* for 5 seconds. For nonblocking we have to try more
@@ -177,35 +168,21 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
* after the retries we will kill the socket and
* reconnect which may clear the network problem.
*/
- rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
- n_vec - first_vec, remaining);
+ rc = sock_sendmsg(ssocket, smb_msg);
if (rc == -EAGAIN) {
- i++;
- if (i >= 14 || (!server->noblocksnd && (i > 2))) {
+ retries++;
+ if (retries >= 14 ||
+ (!server->noblocksnd && (retries > 2))) {
cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
ssocket);
- rc = -EAGAIN;
- break;
+ return -EAGAIN;
}
- msleep(1 << i);
+ msleep(1 << retries);
continue;
}
if (rc < 0)
- break;
-
- /* send was at least partially successful */
- *sent += rc;
-
- if (rc == remaining) {
- remaining = 0;
- break;
- }
-
- if (rc > remaining) {
- cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining);
- break;
- }
+ return rc;
if (rc == 0) {
/* should never happen, letting socket clear before
@@ -215,59 +192,11 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
continue;
}
- remaining -= rc;
-
- /* the line below resets i */
- for (i = first_vec; i < n_vec; i++) {
- if (iov[i].iov_len) {
- if (rc > iov[i].iov_len) {
- rc -= iov[i].iov_len;
- iov[i].iov_len = 0;
- } else {
- iov[i].iov_base += rc;
- iov[i].iov_len -= rc;
- first_vec = i;
- break;
- }
- }
- }
-
- i = 0; /* in case we get ENOSPC on the next send */
- rc = 0;
+ /* send was at least partially successful */
+ *sent += rc;
+ retries = 0; /* in case we get ENOSPC on the next send */
}
- return rc;
-}
-
-/**
- * rqst_page_to_kvec - Turn a slot in the smb_rqst page array into a kvec
- * @rqst: pointer to smb_rqst
- * @idx: index into the array of the page
- * @iov: pointer to struct kvec that will hold the result
- *
- * Helper function to convert a slot in the rqst->rq_pages array into a kvec.
- * The page will be kmapped and the address placed into iov_base. The length
- * will then be adjusted according to the ptailoff.
- */
-void
-cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
- struct kvec *iov)
-{
- /*
- * FIXME: We could avoid this kmap altogether if we used
- * kernel_sendpage instead of kernel_sendmsg. That will only
- * work if signing is disabled though as sendpage inlines the
- * page directly into the fraglist. If userspace modifies the
- * page after we calculate the signature, then the server will
- * reject it and may break the connection. kernel_sendmsg does
- * an extra copy of the data and avoids that issue.
- */
- iov->iov_base = kmap(rqst->rq_pages[idx]);
-
- /* if last page, don't send beyond this offset into page */
- if (idx == (rqst->rq_npages - 1))
- iov->iov_len = rqst->rq_tailsz;
- else
- iov->iov_len = rqst->rq_pagesz;
+ return 0;
}
static unsigned long
@@ -299,8 +228,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
unsigned long send_length;
unsigned int i;
- size_t total_len = 0, sent;
+ size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
+ struct msghdr smb_msg;
int val = 1;
if (ssocket == NULL)
@@ -321,7 +251,13 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
(char *)&val, sizeof(val));
- rc = smb_send_kvec(server, iov, n_vec, &sent);
+ size = 0;
+ for (i = 0; i < n_vec; i++)
+ size += iov[i].iov_len;
+
+ iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, iov, n_vec, size);
+
+ rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
goto uncork;
@@ -329,11 +265,16 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
/* now walk the page array and send each page in it */
for (i = 0; i < rqst->rq_npages; i++) {
- struct kvec p_iov;
-
- cifs_rqst_page_to_kvec(rqst, i, &p_iov);
- rc = smb_send_kvec(server, &p_iov, 1, &sent);
- kunmap(rqst->rq_pages[i]);
+ size_t len = i == rqst->rq_npages - 1
+ ? rqst->rq_tailsz
+ : rqst->rq_pagesz;
+ struct bio_vec bvec = {
+ .bv_page = rqst->rq_pages[i],
+ .bv_len = len
+ };
+ iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC,
+ &bvec, 1, len);
+ rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
break;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index f5dc2f0df4ad6..5e23f64c0804b 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -32,92 +32,25 @@
#include "cifs_unicode.h"
#define MAX_EA_VALUE_SIZE 65535
-#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
/* BB need to add server (Samba e.g) support for security and trusted prefix */
-int cifs_removexattr(struct dentry *direntry, const char *ea_name)
-{
- int rc = -EOPNOTSUPP;
-#ifdef CONFIG_CIFS_XATTR
- unsigned int xid;
- struct cifs_sb_info *cifs_sb;
- struct tcon_link *tlink;
- struct cifs_tcon *pTcon;
- struct super_block *sb;
- char *full_path = NULL;
-
- if (direntry == NULL)
- return -EIO;
- if (d_really_is_negative(direntry))
- return -EIO;
- sb = d_inode(direntry)->i_sb;
- if (sb == NULL)
- return -EIO;
-
- cifs_sb = CIFS_SB(sb);
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink))
- return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
-
- xid = get_xid();
-
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
- goto remove_ea_exit;
- }
- if (ea_name == NULL) {
- cifs_dbg(FYI, "Null xattr names not supported\n");
- } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
- && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
- cifs_dbg(FYI,
- "illegal xattr request %s (only user namespace supported)\n",
- ea_name);
- /* BB what if no namespace prefix? */
- /* Should we just pass them to server, except for
- system and perhaps security prefixes? */
- } else {
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
- goto remove_ea_exit;
+enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT };
- ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
- if (pTcon->ses->server->ops->set_EA)
- rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
- full_path, ea_name, NULL, (__u16)0,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
- }
-remove_ea_exit:
- kfree(full_path);
- free_xid(xid);
- cifs_put_tlink(tlink);
-#endif
- return rc;
-}
-
-int cifs_setxattr(struct dentry *direntry, const char *ea_name,
- const void *ea_value, size_t value_size, int flags)
+static int cifs_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
int rc = -EOPNOTSUPP;
-#ifdef CONFIG_CIFS_XATTR
unsigned int xid;
- struct cifs_sb_info *cifs_sb;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
- struct super_block *sb;
char *full_path;
- if (direntry == NULL)
- return -EIO;
- if (d_really_is_negative(direntry))
- return -EIO;
- sb = d_inode(direntry)->i_sb;
- if (sb == NULL)
- return -EIO;
-
- cifs_sb = CIFS_SB(sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -125,10 +58,10 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
xid = get_xid();
- full_path = build_path_from_dentry(direntry);
+ full_path = build_path_from_dentry(dentry);
if (full_path == NULL) {
rc = -ENOMEM;
- goto set_ea_exit;
+ goto out;
}
/* return dos attributes as pseudo xattr */
/* return alt name if available as pseudo attr */
@@ -136,123 +69,93 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
/* if proc/fs/cifs/streamstoxattr is set then
search server for EAs or streams to
returns as xattrs */
- if (value_size > MAX_EA_VALUE_SIZE) {
+ if (size > MAX_EA_VALUE_SIZE) {
cifs_dbg(FYI, "size of EA value too large\n");
rc = -EOPNOTSUPP;
- goto set_ea_exit;
+ goto out;
}
- if (ea_name == NULL) {
- cifs_dbg(FYI, "Null xattr names not supported\n");
- } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
- == 0) {
+ switch (handler->flags) {
+ case XATTR_USER:
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
- goto set_ea_exit;
- if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
- cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
+ goto out;
- ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
if (pTcon->ses->server->ops->set_EA)
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
- full_path, ea_name, ea_value, (__u16)value_size,
+ full_path, name, value, (__u16)size,
cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
- == 0) {
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
- goto set_ea_exit;
+ break;
- ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
- if (pTcon->ses->server->ops->set_EA)
- rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
- full_path, ea_name, ea_value, (__u16)value_size,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
- strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+ case XATTR_CIFS_ACL: {
#ifdef CONFIG_CIFS_ACL
struct cifs_ntsd *pacl;
- pacl = kmalloc(value_size, GFP_KERNEL);
+
+ if (!value)
+ goto out;
+ pacl = kmalloc(size, GFP_KERNEL);
if (!pacl) {
rc = -ENOMEM;
} else {
- memcpy(pacl, ea_value, value_size);
- if (pTcon->ses->server->ops->set_acl)
+ memcpy(pacl, value, size);
+ if (value &&
+ pTcon->ses->server->ops->set_acl)
rc = pTcon->ses->server->ops->set_acl(pacl,
- value_size, d_inode(direntry),
+ size, inode,
full_path, CIFS_ACL_DACL);
else
rc = -EOPNOTSUPP;
if (rc == 0) /* force revalidate of the inode */
- CIFS_I(d_inode(direntry))->time = 0;
+ CIFS_I(inode)->time = 0;
kfree(pacl);
}
-#else
- cifs_dbg(FYI, "Set CIFS ACL not supported yet\n");
#endif /* CONFIG_CIFS_ACL */
- } else {
- int temp;
- temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
- strlen(XATTR_NAME_POSIX_ACL_ACCESS));
- if (temp == 0) {
+ break;
+ }
+
+ case XATTR_ACL_ACCESS:
#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & MS_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- ea_value, (const int)value_size,
- ACL_TYPE_ACCESS, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc);
-#else
- cifs_dbg(FYI, "set POSIX ACL not supported\n");
-#endif
- } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
- strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
+ if (!value)
+ goto out;
+ if (sb->s_flags & MS_POSIXACL)
+ rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
+ value, (const int)size,
+ ACL_TYPE_ACCESS, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+#endif /* CONFIG_CIFS_POSIX */
+ break;
+
+ case XATTR_ACL_DEFAULT:
#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & MS_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- ea_value, (const int)value_size,
- ACL_TYPE_DEFAULT, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc);
-#else
- cifs_dbg(FYI, "set default POSIX ACL not supported\n");
-#endif
- } else {
- cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n",
- ea_name);
- /* BB what if no namespace prefix? */
- /* Should we just pass them to server, except for
- system and perhaps security prefixes? */
- }
+ if (!value)
+ goto out;
+ if (sb->s_flags & MS_POSIXACL)
+ rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
+ value, (const int)size,
+ ACL_TYPE_DEFAULT, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+#endif /* CONFIG_CIFS_POSIX */
+ break;
}
-set_ea_exit:
+out:
kfree(full_path);
free_xid(xid);
cifs_put_tlink(tlink);
-#endif
return rc;
}
-ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
- void *ea_value, size_t buf_size)
+static int cifs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
{
ssize_t rc = -EOPNOTSUPP;
-#ifdef CONFIG_CIFS_XATTR
unsigned int xid;
- struct cifs_sb_info *cifs_sb;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
- struct super_block *sb;
char *full_path;
- if (direntry == NULL)
- return -EIO;
- if (d_really_is_negative(direntry))
- return -EIO;
- sb = d_inode(direntry)->i_sb;
- if (sb == NULL)
- return -EIO;
-
- cifs_sb = CIFS_SB(sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -260,98 +163,72 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
xid = get_xid();
- full_path = build_path_from_dentry(direntry);
+ full_path = build_path_from_dentry(dentry);
if (full_path == NULL) {
rc = -ENOMEM;
- goto get_ea_exit;
+ goto out;
}
/* return dos attributes as pseudo xattr */
/* return alt name if available as pseudo attr */
- if (ea_name == NULL) {
- cifs_dbg(FYI, "Null xattr names not supported\n");
- } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
- == 0) {
+ switch (handler->flags) {
+ case XATTR_USER:
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
- goto get_ea_exit;
+ goto out;
- if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
- cifs_dbg(FYI, "attempt to query cifs inode metadata\n");
- /* revalidate/getattr then populate from inode */
- } /* BB add else when above is implemented */
- ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
if (pTcon->ses->server->ops->query_all_EAs)
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
- full_path, ea_name, ea_value, buf_size,
+ full_path, name, value, size,
cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
- goto get_ea_exit;
+ break;
- ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
- if (pTcon->ses->server->ops->query_all_EAs)
- rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
- full_path, ea_name, ea_value, buf_size,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
- strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) {
+ case XATTR_CIFS_ACL: {
+#ifdef CONFIG_CIFS_ACL
+ u32 acllen;
+ struct cifs_ntsd *pacl;
+
+ if (pTcon->ses->server->ops->get_acl == NULL)
+ goto out; /* rc already EOPNOTSUPP */
+
+ pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
+ inode, full_path, &acllen);
+ if (IS_ERR(pacl)) {
+ rc = PTR_ERR(pacl);
+ cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
+ __func__, rc);
+ } else {
+ if (value) {
+ if (acllen > size)
+ acllen = -ERANGE;
+ else
+ memcpy(value, pacl, acllen);
+ }
+ rc = acllen;
+ kfree(pacl);
+ }
+#endif /* CONFIG_CIFS_ACL */
+ break;
+ }
+
+ case XATTR_ACL_ACCESS:
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- ea_value, buf_size, ACL_TYPE_ACCESS,
+ value, size, ACL_TYPE_ACCESS,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
-#else
- cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
-#endif /* CONFIG_CIFS_POSIX */
- } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
- strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
+#endif /* CONFIG_CIFS_POSIX */
+ break;
+
+ case XATTR_ACL_DEFAULT:
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- ea_value, buf_size, ACL_TYPE_DEFAULT,
+ value, size, ACL_TYPE_DEFAULT,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
-#else
- cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n");
-#endif /* CONFIG_CIFS_POSIX */
- } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
- strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
-#ifdef CONFIG_CIFS_ACL
- u32 acllen;
- struct cifs_ntsd *pacl;
-
- if (pTcon->ses->server->ops->get_acl == NULL)
- goto get_ea_exit; /* rc already EOPNOTSUPP */
-
- pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
- d_inode(direntry), full_path, &acllen);
- if (IS_ERR(pacl)) {
- rc = PTR_ERR(pacl);
- cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
- __func__, rc);
- } else {
- if (ea_value) {
- if (acllen > buf_size)
- acllen = -ERANGE;
- else
- memcpy(ea_value, pacl, acllen);
- }
- rc = acllen;
- kfree(pacl);
- }
-#else
- cifs_dbg(FYI, "Query CIFS ACL not supported yet\n");
-#endif /* CONFIG_CIFS_ACL */
- } else if (strncmp(ea_name,
- XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
- cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n");
- } else if (strncmp(ea_name,
- XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
- cifs_dbg(FYI, "Security xattr namespace not supported yet\n");
- } else
- cifs_dbg(FYI,
- "illegal xattr request %s (only user namespace supported)\n",
- ea_name);
+#endif /* CONFIG_CIFS_POSIX */
+ break;
+ }
/* We could add an additional check for streams ie
if proc/fs/cifs/streamstoxattr is set then
@@ -361,34 +238,22 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
if (rc == -EINVAL)
rc = -EOPNOTSUPP;
-get_ea_exit:
+out:
kfree(full_path);
free_xid(xid);
cifs_put_tlink(tlink);
-#endif
return rc;
}
ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
{
ssize_t rc = -EOPNOTSUPP;
-#ifdef CONFIG_CIFS_XATTR
unsigned int xid;
- struct cifs_sb_info *cifs_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
- struct super_block *sb;
char *full_path;
- if (direntry == NULL)
- return -EIO;
- if (d_really_is_negative(direntry))
- return -EIO;
- sb = d_inode(direntry)->i_sb;
- if (sb == NULL)
- return -EIO;
-
- cifs_sb = CIFS_SB(sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
return -EOPNOTSUPP;
@@ -419,6 +284,50 @@ list_ea_exit:
kfree(full_path);
free_xid(xid);
cifs_put_tlink(tlink);
-#endif
return rc;
}
+
+static const struct xattr_handler cifs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .flags = XATTR_USER,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+/* os2.* attributes are treated like user.* attributes */
+static const struct xattr_handler cifs_os2_xattr_handler = {
+ .prefix = XATTR_OS2_PREFIX,
+ .flags = XATTR_USER,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+static const struct xattr_handler cifs_cifs_acl_xattr_handler = {
+ .name = CIFS_XATTR_CIFS_ACL,
+ .flags = XATTR_CIFS_ACL,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+static const struct xattr_handler cifs_posix_acl_access_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+ .flags = XATTR_ACL_ACCESS,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+static const struct xattr_handler cifs_posix_acl_default_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+ .flags = XATTR_ACL_DEFAULT,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+const struct xattr_handler *cifs_xattr_handlers[] = {
+ &cifs_user_xattr_handler,
+ &cifs_os2_xattr_handler,
+ &cifs_cifs_acl_xattr_handler,
+ &cifs_posix_acl_access_xattr_handler,
+ &cifs_posix_acl_default_xattr_handler,
+ NULL
+};
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 42e731b8c80a6..6fb8672c0892c 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -424,16 +424,22 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (host_file->f_op->iterate) {
+ if (host_file->f_op->iterate || host_file->f_op->iterate_shared) {
struct inode *host_inode = file_inode(host_file);
-
- inode_lock(host_inode);
ret = -ENOENT;
if (!IS_DEADDIR(host_inode)) {
- ret = host_file->f_op->iterate(host_file, ctx);
- file_accessed(host_file);
+ if (host_file->f_op->iterate_shared) {
+ inode_lock_shared(host_inode);
+ ret = host_file->f_op->iterate_shared(host_file, ctx);
+ file_accessed(host_file);
+ inode_unlock_shared(host_inode);
+ } else {
+ inode_lock(host_inode);
+ ret = host_file->f_op->iterate(host_file, ctx);
+ file_accessed(host_file);
+ inode_unlock(host_inode);
+ }
}
- inode_unlock(host_inode);
return ret;
}
/* Venus: we must read Venus dirents from a file */
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index f36a4040afb80..b0b9cda419280 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -35,7 +35,6 @@ const struct inode_operations coda_ioctl_inode_operations = {
};
const struct file_operations coda_ioctl_operations = {
- .owner = THIS_MODULE,
.unlocked_ioctl = coda_pioctl,
.llseek = noop_llseek,
};
diff --git a/fs/compat.c b/fs/compat.c
index a71936a3f4cb3..be6e48b0a46c2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -884,7 +884,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
.dirent = dirent
@@ -897,7 +897,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
if (buf.result)
error = buf.result;
- fdput(f);
+ fdput_pos(f);
return error;
}
@@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
}
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user(offset, &dirent->d_off))
goto efault;
}
@@ -975,7 +977,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (!access_ok(VERIFY_WRITE, dirent, count))
return -EFAULT;
- f = fdget(fd);
+ f = fdget_pos(fd);
if (!f.file)
return -EBADF;
@@ -989,7 +991,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput(f);
+ fdput_pos(f);
return error;
}
@@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name,
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user_unaligned(offset, &dirent->d_off))
goto efault;
}
@@ -1062,7 +1066,7 @@ COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (!access_ok(VERIFY_WRITE, dirent, count))
return -EFAULT;
- f = fdget(fd);
+ f = fdget_pos(fd);
if (!f.file)
return -EBADF;
@@ -1077,7 +1081,7 @@ COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
- fdput(f);
+ fdput_pos(f);
return error;
}
#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index bd01b92aad98e..c1e9f29c924cd 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -57,6 +57,7 @@
#include <linux/i2c-dev.h>
#include <linux/atalk.h>
#include <linux/gfp.h>
+#include <linux/cec.h>
#include "internal.h"
@@ -1377,6 +1378,17 @@ COMPATIBLE_IOCTL(VIDEO_GET_NAVI)
COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
+/* cec */
+COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS)
+COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS)
+COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS)
+COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR)
+COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR)
+COMPATIBLE_IOCTL(CEC_G_MODE)
+COMPATIBLE_IOCTL(CEC_S_MODE)
+COMPATIBLE_IOCTL(CEC_TRANSMIT)
+COMPATIBLE_IOCTL(CEC_RECEIVE)
+COMPATIBLE_IOCTL(CEC_DQEVENT)
/* joystick */
COMPATIBLE_IOCTL(JSIOCGVERSION)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index ea59c891fc530..56fb26127fef2 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -494,7 +494,7 @@ out:
* If there is an error, the caller will reset the flags via
* configfs_detach_rollback().
*/
-static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
+static int configfs_detach_prep(struct dentry *dentry, struct dentry **wait)
{
struct configfs_dirent *parent_sd = dentry->d_fsdata;
struct configfs_dirent *sd;
@@ -515,8 +515,8 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
if (sd->s_type & CONFIGFS_USET_DEFAULT) {
/* Abort if racing with mkdir() */
if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
- if (wait_mutex)
- *wait_mutex = &d_inode(sd->s_dentry)->i_mutex;
+ if (wait)
+ *wait= dget(sd->s_dentry);
return -EAGAIN;
}
@@ -524,7 +524,7 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
* Yup, recursive. If there's a problem, blame
* deep nesting of default_groups
*/
- ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
+ ret = configfs_detach_prep(sd->s_dentry, wait);
if (!ret)
continue;
} else
@@ -1458,7 +1458,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
* the new link is temporarily attached
*/
do {
- struct mutex *wait_mutex;
+ struct dentry *wait;
mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
@@ -1469,7 +1469,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
*/
ret = sd->s_dependent_count ? -EBUSY : 0;
if (!ret) {
- ret = configfs_detach_prep(dentry, &wait_mutex);
+ ret = configfs_detach_prep(dentry, &wait);
if (ret)
configfs_detach_rollback(dentry);
}
@@ -1483,8 +1483,9 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
}
/* Wait until the racing operation terminates */
- mutex_lock(wait_mutex);
- mutex_unlock(wait_mutex);
+ inode_lock(d_inode(wait));
+ inode_unlock(d_inode(wait));
+ dput(wait);
}
} while (ret == -EAGAIN);
@@ -1632,11 +1633,9 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- if (ctx->pos == 2) {
- spin_lock(&configfs_dirent_lock);
+ spin_lock(&configfs_dirent_lock);
+ if (ctx->pos == 2)
list_move(q, &parent_sd->s_children);
- spin_unlock(&configfs_dirent_lock);
- }
for (p = q->next; p != &parent_sd->s_children; p = p->next) {
struct configfs_dirent *next;
const char *name;
@@ -1647,9 +1646,6 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx)
if (!next->s_element)
continue;
- name = configfs_get_name(next);
- len = strlen(name);
-
/*
* We'll have a dentry and an inode for
* PINNED items and for open attribute
@@ -1663,7 +1659,6 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx)
* they close it. Beyond that, we don't
* care.
*/
- spin_lock(&configfs_dirent_lock);
dentry = next->s_dentry;
if (dentry)
inode = d_inode(dentry);
@@ -1673,15 +1668,18 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx)
if (!inode)
ino = iunique(sb, 2);
+ name = configfs_get_name(next);
+ len = strlen(name);
+
if (!dir_emit(ctx, name, len, ino, dt_type(next)))
return 0;
spin_lock(&configfs_dirent_lock);
list_move(q, p);
- spin_unlock(&configfs_dirent_lock);
p = q;
ctx->pos++;
}
+ spin_unlock(&configfs_dirent_lock);
return 0;
}
@@ -1689,7 +1687,6 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry * dentry = file->f_path.dentry;
- inode_lock(d_inode(dentry));
switch (whence) {
case 1:
offset += file->f_pos;
@@ -1697,7 +1694,6 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- inode_unlock(d_inode(dentry));
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -1723,7 +1719,6 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
spin_unlock(&configfs_dirent_lock);
}
}
- inode_unlock(d_inode(dentry));
return offset;
}
@@ -1732,7 +1727,7 @@ const struct file_operations configfs_dir_operations = {
.release = configfs_dir_close,
.llseek = configfs_dir_lseek,
.read = generic_read_dir,
- .iterate = configfs_readdir,
+ .iterate_shared = configfs_readdir,
};
/**
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 33b7ee34eda5f..c30cf49b69d2f 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -80,11 +80,11 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
count = attr->show(item, buffer->page);
- buffer->needs_read_fill = 0;
BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
- if (count >= 0)
+ if (count >= 0) {
+ buffer->needs_read_fill = 0;
buffer->count = count;
- else
+ } else
ret = count;
return ret;
}
@@ -357,8 +357,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
len = simple_write_to_buffer(buffer->bin_buffer,
buffer->bin_buffer_size, ppos, buf, count);
- if (len > 0)
- *ppos += len;
out:
mutex_unlock(&buffer->mutex);
return len;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 03d124ae27d7a..0387968e6f475 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -156,7 +156,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
if (depth > 0) {
if (depth <= ARRAY_SIZE(default_group_class)) {
- lockdep_set_class(&inode->i_mutex,
+ lockdep_set_class(&inode->i_rwsem,
&default_group_class[depth - 1]);
} else {
/*
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index a8f3b589a2dfe..cfd91320e869f 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -71,8 +71,8 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode;
struct dentry *root;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = CONFIGFS_MAGIC;
sb->s_op = &configfs_ops;
sb->s_time_gran = 1;
diff --git a/fs/coredump.c b/fs/coredump.c
index 47c32c3bfa1d9..281b768000e66 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -413,7 +413,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
if (!mm->core_state)
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
@@ -792,6 +794,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
return 0;
file->f_pos = pos;
cprm->written += n;
+ cprm->pos += n;
nr -= n;
}
return 1;
@@ -803,12 +806,10 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
- if (cprm->written + nr > cprm->limit)
- return 0;
if (dump_interrupted() ||
file->f_op->llseek(file, nr, SEEK_CUR) < 0)
return 0;
- cprm->written += nr;
+ cprm->pos += nr;
return 1;
} else {
while (nr > PAGE_SIZE) {
@@ -823,7 +824,7 @@ EXPORT_SYMBOL(dump_skip);
int dump_align(struct coredump_params *cprm, int align)
{
- unsigned mod = cprm->written & (align - 1);
+ unsigned mod = cprm->pos & (align - 1);
if (align & (align - 1))
return 0;
return mod ? dump_skip(cprm, align - mod) : 1;
diff --git a/fs/cramfs/README b/fs/cramfs/README
index 445d1c2d76467..9d4e7ea311f45 100644
--- a/fs/cramfs/README
+++ b/fs/cramfs/README
@@ -86,26 +86,26 @@ Block Size
(Block size in cramfs refers to the size of input data that is
compressed at a time. It's intended to be somewhere around
-PAGE_CACHE_SIZE for cramfs_readpage's convenience.)
+PAGE_SIZE for cramfs_readpage's convenience.)
The superblock ought to indicate the block size that the fs was
written for, since comments in <linux/pagemap.h> indicate that
-PAGE_CACHE_SIZE may grow in future (if I interpret the comment
+PAGE_SIZE may grow in future (if I interpret the comment
correctly).
-Currently, mkcramfs #define's PAGE_CACHE_SIZE as 4096 and uses that
-for blksize, whereas Linux-2.3.39 uses its PAGE_CACHE_SIZE, which in
+Currently, mkcramfs #define's PAGE_SIZE as 4096 and uses that
+for blksize, whereas Linux-2.3.39 uses its PAGE_SIZE, which in
turn is defined as PAGE_SIZE (which can be as large as 32KB on arm).
This discrepancy is a bug, though it's not clear which should be
changed.
-One option is to change mkcramfs to take its PAGE_CACHE_SIZE from
+One option is to change mkcramfs to take its PAGE_SIZE from
<asm/page.h>. Personally I don't like this option, but it does
require the least amount of change: just change `#define
-PAGE_CACHE_SIZE (4096)' to `#include <asm/page.h>'. The disadvantage
+PAGE_SIZE (4096)' to `#include <asm/page.h>'. The disadvantage
is that the generated cramfs cannot always be shared between different
kernels, not even necessarily kernels of the same architecture if
-PAGE_CACHE_SIZE is subject to change between kernel versions
+PAGE_SIZE is subject to change between kernel versions
(currently possible with arm and ia64).
The remaining options try to make cramfs more sharable.
@@ -126,22 +126,22 @@ size. The options are:
1. Always 4096 bytes.
2. Writer chooses blocksize; kernel adapts but rejects blocksize >
- PAGE_CACHE_SIZE.
+ PAGE_SIZE.
3. Writer chooses blocksize; kernel adapts even to blocksize >
- PAGE_CACHE_SIZE.
+ PAGE_SIZE.
It's easy enough to change the kernel to use a smaller value than
-PAGE_CACHE_SIZE: just make cramfs_readpage read multiple blocks.
+PAGE_SIZE: just make cramfs_readpage read multiple blocks.
-The cost of option 1 is that kernels with a larger PAGE_CACHE_SIZE
+The cost of option 1 is that kernels with a larger PAGE_SIZE
value don't get as good compression as they can.
The cost of option 2 relative to option 1 is that the code uses
variables instead of #define'd constants. The gain is that people
-with kernels having larger PAGE_CACHE_SIZE can make use of that if
+with kernels having larger PAGE_SIZE can make use of that if
they don't mind their cramfs being inaccessible to kernels with
-smaller PAGE_CACHE_SIZE values.
+smaller PAGE_SIZE values.
Option 3 is easy to implement if we don't mind being CPU-inefficient:
e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b862bc219cd7c..7919967488cbd 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -137,7 +137,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
* page cache and dentry tree anyway..
*
* This also acts as a way to guarantee contiguous areas of up to
- * BLKS_PER_BUF*PAGE_CACHE_SIZE, so that the caller doesn't need to
+ * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to
* worry about end-of-buffer issues even when decompressing a full
* page cache.
*/
@@ -152,7 +152,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
*/
#define BLKS_PER_BUF_SHIFT (2)
#define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT)
-#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_CACHE_SIZE)
+#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_SIZE)
static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE];
static unsigned buffer_blocknr[READ_BUFFERS];
@@ -173,8 +173,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
if (!len)
return NULL;
- blocknr = offset >> PAGE_CACHE_SHIFT;
- offset &= PAGE_CACHE_SIZE - 1;
+ blocknr = offset >> PAGE_SHIFT;
+ offset &= PAGE_SIZE - 1;
/* Check if an existing buffer already has the data.. */
for (i = 0; i < READ_BUFFERS; i++) {
@@ -184,14 +184,14 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
continue;
if (blocknr < buffer_blocknr[i])
continue;
- blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT;
+ blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT;
blk_offset += offset;
if (blk_offset + len > BUFFER_SIZE)
continue;
return read_buffers[i] + blk_offset;
}
- devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT;
+ devsize = mapping->host->i_size >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
for (i = 0; i < BLKS_PER_BUF; i++) {
@@ -213,7 +213,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
wait_on_page_locked(page);
if (!PageUptodate(page)) {
/* asynchronous error */
- page_cache_release(page);
+ put_page(page);
pages[i] = NULL;
}
}
@@ -229,12 +229,12 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
struct page *page = pages[i];
if (page) {
- memcpy(data, kmap(page), PAGE_CACHE_SIZE);
+ memcpy(data, kmap(page), PAGE_SIZE);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
} else
- memset(data, 0, PAGE_CACHE_SIZE);
- data += PAGE_CACHE_SIZE;
+ memset(data, 0, PAGE_SIZE);
+ data += PAGE_SIZE;
}
return read_buffers[buffer] + offset;
}
@@ -353,7 +353,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = CRAMFS_MAGIC;
- buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_bsize = PAGE_SIZE;
buf->f_blocks = CRAMFS_SB(sb)->blocks;
buf->f_bfree = 0;
buf->f_bavail = 0;
@@ -496,7 +496,7 @@ static int cramfs_readpage(struct file *file, struct page *page)
int bytes_filled;
void *pgdata;
- maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
bytes_filled = 0;
pgdata = kmap(page);
@@ -516,14 +516,14 @@ static int cramfs_readpage(struct file *file, struct page *page)
if (compr_len == 0)
; /* hole */
- else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
+ else if (unlikely(compr_len > (PAGE_SIZE << 1))) {
pr_err("bad compressed blocksize %u\n",
compr_len);
goto err;
} else {
mutex_lock(&read_mutex);
bytes_filled = cramfs_uncompress_block(pgdata,
- PAGE_CACHE_SIZE,
+ PAGE_SIZE,
cramfs_read(sb, start_offset, compr_len),
compr_len);
mutex_unlock(&read_mutex);
@@ -532,7 +532,7 @@ static int cramfs_readpage(struct file *file, struct page *page)
}
}
- memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
+ memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
flush_dcache_page(page);
kunmap(page);
SetPageUptodate(page);
@@ -561,7 +561,7 @@ static const struct address_space_operations cramfs_aops = {
static const struct file_operations cramfs_directory_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = cramfs_readdir,
+ .iterate_shared = cramfs_readdir,
};
static const struct inode_operations cramfs_dir_inode_operations = {
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 06cd1a22240b4..c502c116924ca 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/bio.h>
#include <linux/dcache.h>
+#include <linux/namei.h>
#include <linux/fscrypto.h>
#include <linux/ecryptfs.h>
@@ -81,13 +82,14 @@ EXPORT_SYMBOL(fscrypt_release_ctx);
/**
* fscrypt_get_ctx() - Gets an encryption context
* @inode: The inode for which we are doing the crypto
+ * @gfp_flags: The gfp flag for memory allocation
*
* Allocates and initializes an encryption context.
*
* Return: An allocated and initialized encryption context on success; error
* value or NULL otherwise.
*/
-struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode)
+struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
{
struct fscrypt_ctx *ctx = NULL;
struct fscrypt_info *ci = inode->i_crypt_info;
@@ -113,7 +115,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode)
list_del(&ctx->free_list);
spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
if (!ctx) {
- ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
+ ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags);
if (!ctx)
return ERR_PTR(-ENOMEM);
ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
@@ -147,7 +149,8 @@ typedef enum {
static int do_page_crypto(struct inode *inode,
fscrypt_direction_t rw, pgoff_t index,
- struct page *src_page, struct page *dest_page)
+ struct page *src_page, struct page *dest_page,
+ gfp_t gfp_flags)
{
u8 xts_tweak[FS_XTS_TWEAK_SIZE];
struct skcipher_request *req = NULL;
@@ -157,7 +160,7 @@ static int do_page_crypto(struct inode *inode,
struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- req = skcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, gfp_flags);
if (!req) {
printk_ratelimited(KERN_ERR
"%s: crypto_request_alloc() failed\n",
@@ -175,10 +178,10 @@ static int do_page_crypto(struct inode *inode,
FS_XTS_TWEAK_SIZE - sizeof(index));
sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+ sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
- skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+ sg_set_page(&src, src_page, PAGE_SIZE, 0);
+ skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE,
xts_tweak);
if (rw == FS_DECRYPT)
res = crypto_skcipher_decrypt(req);
@@ -199,10 +202,9 @@ static int do_page_crypto(struct inode *inode,
return 0;
}
-static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx)
+static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
{
- ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool,
- GFP_NOWAIT);
+ ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
if (ctx->w.bounce_page == NULL)
return ERR_PTR(-ENOMEM);
ctx->flags |= FS_WRITE_PATH_FL;
@@ -213,6 +215,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx)
* fscypt_encrypt_page() - Encrypts a page
* @inode: The inode for which the encryption should take place
* @plaintext_page: The page to encrypt. Must be locked.
+ * @gfp_flags: The gfp flag for memory allocation
*
* Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
* encryption context.
@@ -225,7 +228,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx)
* error value or NULL.
*/
struct page *fscrypt_encrypt_page(struct inode *inode,
- struct page *plaintext_page)
+ struct page *plaintext_page, gfp_t gfp_flags)
{
struct fscrypt_ctx *ctx;
struct page *ciphertext_page = NULL;
@@ -233,18 +236,19 @@ struct page *fscrypt_encrypt_page(struct inode *inode,
BUG_ON(!PageLocked(plaintext_page));
- ctx = fscrypt_get_ctx(inode);
+ ctx = fscrypt_get_ctx(inode, gfp_flags);
if (IS_ERR(ctx))
return (struct page *)ctx;
/* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_bounce_page(ctx);
+ ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
if (IS_ERR(ciphertext_page))
goto errout;
ctx->w.control_page = plaintext_page;
err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page);
+ plaintext_page, ciphertext_page,
+ gfp_flags);
if (err) {
ciphertext_page = ERR_PTR(err);
goto errout;
@@ -275,7 +279,7 @@ int fscrypt_decrypt_page(struct page *page)
BUG_ON(!PageLocked(page));
return do_page_crypto(page->mapping->host,
- FS_DECRYPT, page->index, page, page);
+ FS_DECRYPT, page->index, page, page, GFP_NOFS);
}
EXPORT_SYMBOL(fscrypt_decrypt_page);
@@ -287,13 +291,13 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
struct bio *bio;
int ret, err = 0;
- BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
- ctx = fscrypt_get_ctx(inode);
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ciphertext_page = alloc_bounce_page(ctx);
+ ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT);
if (IS_ERR(ciphertext_page)) {
err = PTR_ERR(ciphertext_page);
goto errout;
@@ -301,11 +305,12 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
while (len--) {
err = do_page_crypto(inode, FS_ENCRYPT, lblk,
- ZERO_PAGE(0), ciphertext_page);
+ ZERO_PAGE(0), ciphertext_page,
+ GFP_NOFS);
if (err)
goto errout;
- bio = bio_alloc(GFP_KERNEL, 1);
+ bio = bio_alloc(GFP_NOWAIT, 1);
if (!bio) {
err = -ENOMEM;
goto errout;
@@ -313,6 +318,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
bio->bi_bdev = inode->i_sb->s_bdev;
bio->bi_iter.bi_sector =
pblk << (inode->i_sb->s_blocksize_bits - 9);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
ret = bio_add_page(bio, ciphertext_page,
inode->i_sb->s_blocksize, 0);
if (ret != inode->i_sb->s_blocksize) {
@@ -322,7 +328,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
err = -EIO;
goto errout;
}
- err = submit_bio_wait(WRITE, bio);
+ err = submit_bio_wait(bio);
if ((err == 0) && bio->bi_error)
err = -EIO;
bio_put(bio);
@@ -345,13 +351,20 @@ EXPORT_SYMBOL(fscrypt_zeroout_range);
*/
static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct inode *dir = d_inode(dentry->d_parent);
- struct fscrypt_info *ci = dir->i_crypt_info;
+ struct dentry *dir;
+ struct fscrypt_info *ci;
int dir_has_key, cached_with_key;
- if (!dir->i_sb->s_cop->is_encrypted(dir))
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ dir = dget_parent(dentry);
+ if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) {
+ dput(dir);
return 0;
+ }
+ ci = d_inode(dir)->i_crypt_info;
if (ci && ci->ci_keyring_key &&
(ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
(1 << KEY_FLAG_REVOKED) |
@@ -363,6 +376,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
spin_unlock(&dentry->d_lock);
dir_has_key = (ci != NULL);
+ dput(dir);
/*
* If the dentry was cached without the key, and it is a
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 06f5aa478bf2c..1ac263eddc4eb 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -78,6 +78,67 @@ out:
return res;
}
+static int validate_user_key(struct fscrypt_info *crypt_info,
+ struct fscrypt_context *ctx, u8 *raw_key,
+ u8 *prefix, int prefix_size)
+{
+ u8 *full_key_descriptor;
+ struct key *keyring_key;
+ struct fscrypt_key *master_key;
+ const struct user_key_payload *ukp;
+ int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1;
+ int res;
+
+ full_key_descriptor = kmalloc(full_key_len, GFP_NOFS);
+ if (!full_key_descriptor)
+ return -ENOMEM;
+
+ memcpy(full_key_descriptor, prefix, prefix_size);
+ sprintf(full_key_descriptor + prefix_size,
+ "%*phN", FS_KEY_DESCRIPTOR_SIZE,
+ ctx->master_key_descriptor);
+ full_key_descriptor[full_key_len - 1] = '\0';
+ keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+ kfree(full_key_descriptor);
+ if (IS_ERR(keyring_key))
+ return PTR_ERR(keyring_key);
+
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "%s: key type must be logon\n", __func__);
+ res = -ENOKEY;
+ goto out;
+ }
+ down_read(&keyring_key->sem);
+ ukp = user_key_payload(keyring_key);
+ if (ukp->datalen != sizeof(struct fscrypt_key)) {
+ res = -EINVAL;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ master_key = (struct fscrypt_key *)ukp->data;
+ BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
+
+ if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "%s: key size incorrect: %d\n",
+ __func__, master_key->size);
+ res = -ENOKEY;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ res = derive_key_aes(ctx->nonce, master_key->raw, raw_key);
+ up_read(&keyring_key->sem);
+ if (res)
+ goto out;
+
+ crypt_info->ci_keyring_key = keyring_key;
+ return 0;
+out:
+ key_put(keyring_key);
+ return res;
+}
+
static void put_crypt_info(struct fscrypt_info *ci)
{
if (!ci)
@@ -91,12 +152,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
int get_crypt_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
- u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
- (FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
- struct key *keyring_key = NULL;
- struct fscrypt_key *master_key;
struct fscrypt_context ctx;
- const struct user_key_payload *ukp;
struct crypto_skcipher *ctfm;
const char *cipher_str;
u8 raw_key[FS_MAX_KEY_SIZE];
@@ -167,48 +223,24 @@ retry:
memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
goto got_key;
}
- memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
- FS_KEY_DESC_PREFIX_SIZE);
- sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
- "%*phN", FS_KEY_DESCRIPTOR_SIZE,
- ctx.master_key_descriptor);
- full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
- (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
- keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
- if (IS_ERR(keyring_key)) {
- res = PTR_ERR(keyring_key);
- keyring_key = NULL;
- goto out;
- }
- crypt_info->ci_keyring_key = keyring_key;
- if (keyring_key->type != &key_type_logon) {
- printk_once(KERN_WARNING
- "%s: key type must be logon\n", __func__);
- res = -ENOKEY;
- goto out;
- }
- down_read(&keyring_key->sem);
- ukp = user_key_payload(keyring_key);
- if (ukp->datalen != sizeof(struct fscrypt_key)) {
- res = -EINVAL;
- up_read(&keyring_key->sem);
- goto out;
- }
- master_key = (struct fscrypt_key *)ukp->data;
- BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
- if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
- printk_once(KERN_WARNING
- "%s: key size incorrect: %d\n",
- __func__, master_key->size);
- res = -ENOKEY;
- up_read(&keyring_key->sem);
+ res = validate_user_key(crypt_info, &ctx, raw_key,
+ FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE);
+ if (res && inode->i_sb->s_cop->key_prefix) {
+ u8 *prefix = NULL;
+ int prefix_size, res2;
+
+ prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix);
+ res2 = validate_user_key(crypt_info, &ctx, raw_key,
+ prefix, prefix_size);
+ if (res2) {
+ if (res2 == -ENOKEY)
+ res = -ENOKEY;
+ goto out;
+ }
+ } else if (res) {
goto out;
}
- res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
- up_read(&keyring_key->sem);
- if (res)
- goto out;
got_key:
ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
if (!ctfm || IS_ERR(ctfm)) {
diff --git a/fs/dax.c b/fs/dax.c
index 90322eb7498c1..993dc6fe0416e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,18 +32,56 @@
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+/*
+ * We use the lowest available bit in an exceptional entry for locking and
+ * the other two bits to determine the entry type: 3 special bits in total.
+ */
+#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+ RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
+ RADIX_TREE_EXCEPTIONAL_ENTRY))
+
+/* We choose 4096 entries - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+static int __init init_dax_wait_table(void)
+{
+ int i;
+
+ for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+ init_waitqueue_head(wait_table + i);
+ return 0;
+}
+fs_initcall(init_dax_wait_table);
+
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+ pgoff_t index)
+{
+ unsigned long hash = hash_long((unsigned long)mapping ^ index,
+ DAX_WAIT_TABLE_BITS);
+ return wait_table + hash;
+}
+
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
struct request_queue *q = bdev->bd_queue;
long rc = -EIO;
- dax->addr = (void __pmem *) ERR_PTR(-EIO);
+ dax->addr = ERR_PTR(-EIO);
if (blk_queue_enter(q, true) != 0)
return rc;
rc = bdev_direct_access(bdev, dax);
if (rc < 0) {
- dax->addr = (void __pmem *) ERR_PTR(rc);
+ dax->addr = ERR_PTR(rc);
blk_queue_exit(q);
return rc;
}
@@ -78,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
return page;
}
-/*
- * dax_clear_sectors() is called from within transaction context from XFS,
- * and hence this means the stack from this point must follow GFP_NOFS
- * semantics for all operations.
- */
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
-{
- struct blk_dax_ctl dax = {
- .sector = _sector,
- .size = _size,
- };
-
- might_sleep();
- do {
- long count, sz;
-
- count = dax_map_atomic(bdev, &dax);
- if (count < 0)
- return count;
- sz = min_t(long, count, SZ_128K);
- clear_pmem(dax.addr, sz);
- dax.size -= sz;
- dax.sector += sz / 512;
- dax_unmap_atomic(bdev, &dax);
- cond_resched();
- } while (dax.size);
-
- wmb_pmem();
- return 0;
-}
-EXPORT_SYMBOL_GPL(dax_clear_sectors);
-
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
- loff_t pos, loff_t end)
-{
- loff_t final = end - pos + first; /* The final byte of the buffer */
-
- if (first > 0)
- clear_pmem(addr, first);
- if (final < size)
- clear_pmem(addr + final, size - final);
-}
-
static bool buffer_written(struct buffer_head *bh)
{
return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -153,13 +147,16 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
struct buffer_head *bh)
{
loff_t pos = start, max = start, bh_max = start;
- bool hole = false, need_wmb = false;
+ bool hole = false;
struct block_device *bdev = NULL;
int rw = iov_iter_rw(iter), rc;
long map_len = 0;
struct blk_dax_ctl dax = {
- .addr = (void __pmem *) ERR_PTR(-EIO),
+ .addr = ERR_PTR(-EIO),
};
+ unsigned blkbits = inode->i_blkbits;
+ sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+ >> blkbits;
if (rw == READ)
end = min(end, i_size_read(inode));
@@ -167,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
while (pos < end) {
size_t len;
if (pos == max) {
- unsigned blkbits = inode->i_blkbits;
long page = pos >> PAGE_SHIFT;
sector_t block = page << (PAGE_SHIFT - blkbits);
unsigned first = pos - (block << blkbits);
@@ -183,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
bdev = bh->b_bdev;
+ /*
+ * We allow uninitialized buffers for writes
+ * beyond EOF as those cannot race with faults
+ */
+ WARN_ON_ONCE(
+ (buffer_new(bh) && block < file_blks) ||
+ (rw == WRITE && buffer_unwritten(bh)));
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -202,20 +205,19 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
rc = map_len;
break;
}
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- dax_new_buf(dax.addr, map_len, first,
- pos, end);
- need_wmb = true;
- }
dax.addr += first;
size = map_len - first;
}
- max = min(pos + size, end);
+ /*
+ * pos + size is one past the last offset for IO,
+ * so pos + size can overflow loff_t at extreme offsets.
+ * Cast to u64 to catch this and get the true minimum.
+ */
+ max = min_t(u64, pos + size, end);
}
if (iov_iter_rw(iter) == WRITE) {
len = copy_from_iter_pmem(dax.addr, max - pos, iter);
- need_wmb = true;
} else if (!hole)
len = copy_to_iter((void __force *) dax.addr, max - pos,
iter);
@@ -232,8 +234,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
dax.addr += len;
}
- if (need_wmb)
- wmb_pmem();
dax_unmap_atomic(bdev, &dax);
return (pos == start) ? rc : pos - start;
@@ -244,7 +244,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
* @iocb: The control block for this I/O
* @inode: The file which the I/O is directed at
* @iter: The addresses to do I/O from or to
- * @pos: The file offset where the I/O starts
* @get_block: The filesystem method used to translate file offsets to blocks
* @end_io: A filesystem callback for I/O completion
* @flags: See below
@@ -257,25 +256,19 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
* is in progress.
*/
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
- struct iov_iter *iter, loff_t pos, get_block_t get_block,
+ struct iov_iter *iter, get_block_t get_block,
dio_iodone_t end_io, int flags)
{
struct buffer_head bh;
ssize_t retval = -EINVAL;
+ loff_t pos = iocb->ki_pos;
loff_t end = pos + iov_iter_count(iter);
memset(&bh, 0, sizeof(bh));
bh.b_bdev = inode->i_sb->s_bdev;
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
- struct address_space *mapping = inode->i_mapping;
+ if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
inode_lock(inode);
- retval = filemap_write_and_wait_range(mapping, pos, end - 1);
- if (retval) {
- inode_unlock(inode);
- goto out;
- }
- }
/* Protects against truncate */
if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -296,12 +289,268 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
if (!(flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(inode);
- out:
return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
/*
+ * DAX radix tree locking
+ */
+struct exceptional_entry_key {
+ struct address_space *mapping;
+ unsigned long index;
+};
+
+struct wait_exceptional_entry_queue {
+ wait_queue_t wait;
+ struct exceptional_entry_key key;
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+ int sync, void *keyp)
+{
+ struct exceptional_entry_key *key = keyp;
+ struct wait_exceptional_entry_queue *ewait =
+ container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+ if (key->mapping != ewait->key.mapping ||
+ key->index != ewait->key.index)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * Check whether the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline int slot_locked(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ return entry & RADIX_DAX_ENTRY_LOCK;
+}
+
+/*
+ * Mark the given slot as locked. The function must be called with
+ * mapping->tree_lock held.
+ */
+static inline void *lock_slot(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+ entry |= RADIX_DAX_ENTRY_LOCK;
+ radix_tree_replace_slot(slot, (void *)entry);
+ return (void *)entry;
+}
+
+/*
+ * Mark the given slot as unlocked. The function must be called with
+ * mapping->tree_lock held.
+ */
+static inline void *unlock_slot(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+ entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
+ radix_tree_replace_slot(slot, (void *)entry);
+ return (void *)entry;
+}
+
+/*
+ * Look up the entry in the radix tree, wait for it to become unlocked if it
+ * is an exceptional entry, and return it. The caller must call
+ * put_unlocked_mapping_entry() if it decides not to lock the entry, or
+ * put_locked_mapping_entry() once it has locked the entry and wants to
+ * unlock it.
+ *
+ * The function must be called with mapping->tree_lock held.
+ */
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void ***slotp)
+{
+ void *ret, **slot;
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+ ewait.key.mapping = mapping;
+ ewait.key.index = index;
+
+ for (;;) {
+ ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+ &slot);
+ if (!ret || !radix_tree_exceptional_entry(ret) ||
+ !slot_locked(mapping, slot)) {
+ if (slotp)
+ *slotp = slot;
+ return ret;
+ }
+ prepare_to_wait_exclusive(wq, &ewait.wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&mapping->tree_lock);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+}
+
+/*
+ * Find the radix tree entry at the given index. If it points to a page,
+ * return with the page locked. If it points to an exceptional entry, return
+ * with the radix tree entry locked. If the radix tree has no entry at that
+ * index, create an empty exceptional entry there and return with it locked.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ */
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *ret, **slot;
+
+restart:
+ spin_lock_irq(&mapping->tree_lock);
+ ret = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* No entry for given index? Make sure radix tree is big enough. */
+ if (!ret) {
+ int err;
+
+ spin_unlock_irq(&mapping->tree_lock);
+ err = radix_tree_preload(
+ mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+ if (err)
+ return ERR_PTR(err);
+ ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+ RADIX_DAX_ENTRY_LOCK);
+ spin_lock_irq(&mapping->tree_lock);
+ err = radix_tree_insert(&mapping->page_tree, index, ret);
+ radix_tree_preload_end();
+ if (err) {
+ spin_unlock_irq(&mapping->tree_lock);
+ /* Someone already created the entry? */
+ if (err == -EEXIST)
+ goto restart;
+ return ERR_PTR(err);
+ }
+ /* Good, we have inserted empty locked entry into the tree. */
+ mapping->nrexceptional++;
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+ }
+ /* Normal page in radix tree? */
+ if (!radix_tree_exceptional_entry(ret)) {
+ struct page *page = ret;
+
+ get_page(page);
+ spin_unlock_irq(&mapping->tree_lock);
+ lock_page(page);
+ /* Page got truncated? Retry... */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto restart;
+ }
+ return page;
+ }
+ ret = lock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+}
+
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+ pgoff_t index, bool wake_all)
+{
+ wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+ /*
+ * Checking for locked entry and prepare_to_wait_exclusive() happens
+ * under mapping->tree_lock, ditto for entry handling in our callers.
+ * So at this point all tasks that could have seen our entry locked
+ * must be in the waitqueue and the following check will see them.
+ */
+ if (waitqueue_active(wq)) {
+ struct exceptional_entry_key key;
+
+ key.mapping = mapping;
+ key.index = index;
+ __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+ }
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *ret, **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+ !slot_locked(mapping, slot))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return;
+ }
+ unlock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry)) {
+ unlock_page(entry);
+ put_page(entry);
+ } else {
+ dax_unlock_mapping_entry(mapping, index);
+ }
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry))
+ return;
+
+ /* We have to wake up next waiter for the radix tree entry lock */
+ dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+/*
+ * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
+ * entry to get unlocked before deleting it.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *entry;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ /*
+ * This gets called from truncate / punch_hole path. As such, the caller
+ * must hold locks protecting against concurrent modifications of the
+ * radix tree (usually fs-private i_mmap_sem for writing). Since the
+ * caller has seen exceptional entry for this index, we better find it
+ * at that index as well...
+ */
+ if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+ }
+ radix_tree_delete(&mapping->page_tree, index);
+ mapping->nrexceptional--;
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, true);
+
+ return 1;
+}
+
+/*
* The user has performed a load from a hole in the file. Allocating
* a new page in the file would cause excessive storage usage for
* workloads with sparse files. We allocate a page cache page instead.
@@ -309,24 +558,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
* otherwise it will simply fall out of the page cache under memory
* pressure without ever having been dirtied.
*/
-static int dax_load_hole(struct address_space *mapping, struct page *page,
- struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+ struct vm_fault *vmf)
{
- unsigned long size;
- struct inode *inode = mapping->host;
- if (!page)
- page = find_or_create_page(mapping, vmf->pgoff,
- GFP_KERNEL | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- /* Recheck i_size under page lock to avoid truncate race */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- unlock_page(page);
- page_cache_release(page);
- return VM_FAULT_SIGBUS;
+ struct page *page;
+
+ /* Hole page already exists? Return it... */
+ if (!radix_tree_exceptional_entry(entry)) {
+ vmf->page = entry;
+ return VM_FAULT_LOCKED;
}
+ /* This will replace locked radix tree entry with a hole page */
+ page = find_or_create_page(mapping, vmf->pgoff,
+ vmf->gfp_mask | __GFP_ZERO);
+ if (!page) {
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ return VM_FAULT_OOM;
+ }
vmf->page = page;
return VM_FAULT_LOCKED;
}
@@ -350,77 +599,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
return 0;
}
-#define NO_SECTOR -1
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
- sector_t sector, bool pmd_entry, bool dirty)
+static void *dax_insert_mapping_entry(struct address_space *mapping,
+ struct vm_fault *vmf,
+ void *entry, sector_t sector)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- pgoff_t pmd_index = DAX_PMD_INDEX(index);
- int type, error = 0;
- void *entry;
+ int error = 0;
+ bool hole_fill = false;
+ void *new_entry;
+ pgoff_t index = vmf->pgoff;
- WARN_ON_ONCE(pmd_entry && !dirty);
- if (dirty)
+ if (vmf->flags & FAULT_FLAG_WRITE)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- spin_lock_irq(&mapping->tree_lock);
-
- entry = radix_tree_lookup(page_tree, pmd_index);
- if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
- index = pmd_index;
- goto dirty;
+ /* Replacing hole page with block mapping? */
+ if (!radix_tree_exceptional_entry(entry)) {
+ hole_fill = true;
+ /*
+ * Unmap the page now before we remove it from page cache below.
+ * The page is locked so it cannot be faulted in again.
+ */
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_SIZE, 0);
+ error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
+ if (error)
+ return ERR_PTR(error);
}
- entry = radix_tree_lookup(page_tree, index);
- if (entry) {
- type = RADIX_DAX_TYPE(entry);
- if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
- type != RADIX_DAX_PMD)) {
- error = -EIO;
+ spin_lock_irq(&mapping->tree_lock);
+ new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
+ RADIX_DAX_ENTRY_LOCK);
+ if (hole_fill) {
+ __delete_from_page_cache(entry, NULL);
+ /* Drop pagecache reference */
+ put_page(entry);
+ error = radix_tree_insert(page_tree, index, new_entry);
+ if (error) {
+ new_entry = ERR_PTR(error);
goto unlock;
}
+ mapping->nrexceptional++;
+ } else {
+ void **slot;
+ void *ret;
- if (!pmd_entry || type == RADIX_DAX_PMD)
- goto dirty;
-
- /*
- * We only insert dirty PMD entries into the radix tree. This
- * means we don't need to worry about removing a dirty PTE
- * entry and inserting a clean PMD entry, thus reducing the
- * range we would flush with a follow-up fsync/msync call.
- */
- radix_tree_delete(&mapping->page_tree, index);
- mapping->nrexceptional--;
- }
-
- if (sector == NO_SECTOR) {
- /*
- * This can happen during correct operation if our pfn_mkwrite
- * fault raced against a hole punch operation. If this
- * happens the pte that was hole punched will have been
- * unmapped and the radix tree entry will have been removed by
- * the time we are called, but the call will still happen. We
- * will return all the way up to wp_pfn_shared(), where the
- * pte_same() check will fail, eventually causing page fault
- * to be retried by the CPU.
- */
- goto unlock;
+ ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+ WARN_ON_ONCE(ret != entry);
+ radix_tree_replace_slot(slot, new_entry);
}
-
- error = radix_tree_insert(page_tree, index,
- RADIX_DAX_ENTRY(sector, pmd_entry));
- if (error)
- goto unlock;
-
- mapping->nrexceptional++;
- dirty:
- if (dirty)
+ if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
unlock:
spin_unlock_irq(&mapping->tree_lock);
- return error;
+ if (hole_fill) {
+ radix_tree_preload_end();
+ /*
+ * We don't need hole page anymore, it has been replaced with
+ * locked radix tree entry now.
+ */
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(entry);
+ unlock_page(entry);
+ put_page(entry);
+ }
+ return new_entry;
}
static int dax_writeback_one(struct block_device *bdev,
@@ -506,8 +750,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
return 0;
- start_index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end_index = wbc->range_end >> PAGE_CACHE_SHIFT;
+ start_index = wbc->range_start >> PAGE_SHIFT;
+ end_index = wbc->range_end >> PAGE_SHIFT;
pmd_index = DAX_PMD_INDEX(start_index);
rcu_read_lock();
@@ -541,86 +785,52 @@ int dax_writeback_mapping_range(struct address_space *mapping,
return ret;
}
}
- wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+static int dax_insert_mapping(struct address_space *mapping,
+ struct buffer_head *bh, void **entryp,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- struct address_space *mapping = inode->i_mapping;
struct block_device *bdev = bh->b_bdev;
struct blk_dax_ctl dax = {
- .sector = to_sector(bh, inode),
+ .sector = to_sector(bh, mapping->host),
.size = bh->b_size,
};
- pgoff_t size;
- int error;
+ void *ret;
+ void *entry = *entryp;
- i_mmap_lock_read(mapping);
-
- /*
- * Check truncate didn't happen while we were allocating a block.
- * If it did, this block may or may not be still allocated to the
- * file. We can't tell the filesystem to free it because we can't
- * take i_mutex here. In the worst case, the file still has blocks
- * allocated past the end of the file.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- error = -EIO;
- goto out;
- }
-
- if (dax_map_atomic(bdev, &dax) < 0) {
- error = PTR_ERR(dax.addr);
- goto out;
- }
-
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- clear_pmem(dax.addr, PAGE_SIZE);
- wmb_pmem();
- }
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
dax_unmap_atomic(bdev, &dax);
- error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
- vmf->flags & FAULT_FLAG_WRITE);
- if (error)
- goto out;
-
- error = vm_insert_mixed(vma, vaddr, dax.pfn);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+ *entryp = ret;
- out:
- i_mmap_unlock_read(mapping);
-
- return error;
+ return vm_insert_mixed(vma, vaddr, dax.pfn);
}
/**
- * __dax_fault - handle a page fault on a DAX file
+ * dax_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- * to written so the data written to them is exposed. This is required for
- * required by write faults for filesystems that will return unwritten
- * extent mappings from @get_block, but it is optional for reads as
- * dax_insert_mapping() will always zero unwritten blocks. If the fs does
- * not support unwritten extents, the it should pass NULL.
*
* When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * fault handler for DAX files. dax_fault() assumes the caller has done all
* the necessary locking for the page fault to proceed successfully.
*/
-int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
+int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- struct page *page;
+ void *entry;
struct buffer_head bh;
unsigned long vaddr = (unsigned long)vmf->virtual_address;
unsigned blkbits = inode->i_blkbits;
@@ -629,6 +839,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
int error;
int major = 0;
+ /*
+ * Check whether offset isn't beyond end of file now. Caller is supposed
+ * to hold locks serializing us with truncate / punch hole so this is
+ * a reliable test.
+ */
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;
@@ -638,49 +853,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_SIZE;
- repeat:
- page = find_get_page(mapping, vmf->pgoff);
- if (page) {
- if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
- page_cache_release(page);
- return VM_FAULT_RETRY;
- }
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- page_cache_release(page);
- goto repeat;
- }
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- /*
- * We have a struct page covering a hole in the file
- * from a read fault and we've raced with a truncate
- */
- error = -EIO;
- goto unlock_page;
- }
+ entry = grab_mapping_entry(mapping, vmf->pgoff);
+ if (IS_ERR(entry)) {
+ error = PTR_ERR(entry);
+ goto out;
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock_page;
-
- if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
- if (vmf->flags & FAULT_FLAG_WRITE) {
- error = get_block(inode, block, &bh, 1);
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
- if (!error && (bh.b_size < PAGE_SIZE))
- error = -EIO;
- if (error)
- goto unlock_page;
- } else {
- return dax_load_hole(mapping, page, vmf);
- }
- }
+ goto unlock_entry;
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
@@ -689,53 +872,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock_page;
- vmf->page = page;
- if (!page) {
- i_mmap_lock_read(mapping);
- /* Check we didn't race with truncate */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
- error = -EIO;
- goto out;
- }
+ goto unlock_entry;
+ if (!radix_tree_exceptional_entry(entry)) {
+ vmf->page = entry;
+ return VM_FAULT_LOCKED;
}
- return VM_FAULT_LOCKED;
- }
-
- /* Check we didn't race with a read fault installing a new page */
- if (!page && major)
- page = find_lock_page(mapping, vmf->pgoff);
-
- if (page) {
- unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
- PAGE_CACHE_SIZE, 0);
- delete_from_page_cache(page);
- unlock_page(page);
- page_cache_release(page);
- page = NULL;
+ vmf->entry = entry;
+ return VM_FAULT_DAX_LOCKED;
}
- /*
- * If we successfully insert the new mapping over an unwritten extent,
- * we need to ensure we convert the unwritten extent. If there is an
- * error inserting the mapping, the filesystem needs to leave it as
- * unwritten to prevent exposure of the stale underlying data to
- * userspace, but we still need to call the completion function so
- * the private resources on the mapping buffer can be released. We
- * indicate what the callback should do via the uptodate variable, same
- * as for normal BH based IO completions.
- */
- error = dax_insert_mapping(inode, &bh, vma, vmf);
- if (buffer_unwritten(&bh)) {
- if (complete_unwritten)
- complete_unwritten(&bh, !error);
- else
- WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+ if (!buffer_mapped(&bh)) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ error = get_block(inode, block, &bh, 1);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ major = VM_FAULT_MAJOR;
+ if (!error && (bh.b_size < PAGE_SIZE))
+ error = -EIO;
+ if (error)
+ goto unlock_entry;
+ } else {
+ return dax_load_hole(mapping, entry, vmf);
+ }
}
+ /* Filesystem should not return unwritten buffers to us! */
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
+ error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -743,44 +908,10 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if ((error < 0) && (error != -EBUSY))
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
-
- unlock_page:
- if (page) {
- unlock_page(page);
- page_cache_release(page);
- }
- goto out;
-}
-EXPORT_SYMBOL(__dax_fault);
-
-/**
- * dax_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
-{
- int result;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
- if (vmf->flags & FAULT_FLAG_WRITE) {
- sb_start_pagefault(sb);
- file_update_time(vma->vm_file);
- }
- result = __dax_fault(vma, vmf, get_block, complete_unwritten);
- if (vmf->flags & FAULT_FLAG_WRITE)
- sb_end_pagefault(sb);
-
- return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
* more often than one might expect in the below function.
@@ -805,9 +936,17 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
-int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @address: The virtual address the fault is being handled for
+ * @pmd: Pointer to the PMD entry for the fault
+ * @flags: The fault flags (e.g. FAULT_FLAG_WRITE)
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -819,7 +958,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
struct block_device *bdev;
pgoff_t size, pgoff;
sector_t block;
- int error, result = 0;
+ int result = 0;
bool alloc = false;
/* dax pmd mappings require pfn_t_devmap() */
@@ -866,6 +1005,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (get_block(inode, block, &bh, 1) != 0)
return VM_FAULT_SIGBUS;
alloc = true;
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
}
bdev = bh.b_bdev;
@@ -891,26 +1031,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
truncate_pagecache_range(inode, lstart, lend);
}
- i_mmap_lock_read(mapping);
-
- /*
- * If a truncate happened while we were allocating blocks, we may
- * leave blocks allocated to the file that are beyond EOF. We can't
- * take i_mutex here, so just leave them hanging; they'll be freed
- * when the file is deleted.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size) {
- result = VM_FAULT_SIGBUS;
- goto out;
- }
- if ((pgoff | PG_PMD_COLOUR) >= size) {
- dax_pmd_dbg(&bh, address,
- "offset + huge page size > file size");
- goto fallback;
- }
-
- if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ if (!write && !buffer_mapped(&bh)) {
spinlock_t *ptl;
pmd_t entry;
struct page *zero_page = get_huge_zero_page();
@@ -945,8 +1066,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
long length = dax_map_atomic(bdev, &dax);
if (length < 0) {
- result = VM_FAULT_SIGBUS;
- goto out;
+ dax_pmd_dbg(&bh, address, "dax-error fallback");
+ goto fallback;
}
if (length < PMD_SIZE) {
dax_pmd_dbg(&bh, address, "dax-length too small");
@@ -964,14 +1085,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
dax_pmd_dbg(&bh, address, "pfn not in memmap");
goto fallback;
}
-
- if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- clear_pmem(dax.addr, PMD_SIZE);
- wmb_pmem();
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- result |= VM_FAULT_MAJOR;
- }
dax_unmap_atomic(bdev, &dax);
/*
@@ -984,19 +1097,16 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
*
* The PMD path doesn't have an equivalent to
* dax_pfn_mkwrite(), though, so for a read followed by a
- * write we traverse all the way through __dax_pmd_fault()
+ * write we traverse all the way through dax_pmd_fault()
* twice. This means we can just skip inserting a radix tree
* entry completely on the initial read and just wait until
* the write to insert a dirty entry.
*/
if (write) {
- error = dax_radix_entry(mapping, pgoff, dax.sector,
- true, true);
- if (error) {
- dax_pmd_dbg(&bh, address,
- "PMD radix insertion failed");
- goto fallback;
- }
+ /*
+ * TODO: insert a radix-tree entry for this PMD and mark it dirty.
+ * Nothing is inserted on this path for now, so this is broken...
+ */
}
dev_dbg(part_to_dev(bdev->bd_part),
@@ -1009,11 +1119,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
}
out:
- i_mmap_unlock_read(mapping);
-
- if (buffer_unwritten(&bh))
- complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
return result;
fallback:
@@ -1021,35 +1126,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
result = VM_FAULT_FALLBACK;
goto out;
}
-EXPORT_SYMBOL_GPL(__dax_pmd_fault);
-
-/**
- * dax_pmd_fault - handle a PMD fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * pmd_fault handler for DAX files.
- */
-int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
-{
- int result;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
- if (flags & FAULT_FLAG_WRITE) {
- sb_start_pagefault(sb);
- file_update_time(vma->vm_file);
- }
- result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
- complete_unwritten);
- if (flags & FAULT_FLAG_WRITE)
- sb_end_pagefault(sb);
-
- return result;
-}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -1061,27 +1137,58 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
- int error;
-
- /*
- * We pass NO_SECTOR to dax_radix_entry() because we expect that a
- * RADIX_DAX_PTE entry already exists in the radix tree from a
- * previous call to __dax_fault(). We just want to look up that PTE
- * entry using vmf->pgoff and make sure the dirty tag is set. This
- * saves us from having to make a call to get_block() here to look
- * up the sector.
- */
- error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
- true);
+ struct address_space *mapping = file->f_mapping;
+ void *entry;
+ pgoff_t index = vmf->pgoff;
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- if (error)
- return VM_FAULT_SIGBUS;
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ if (!entry || !radix_tree_exceptional_entry(entry))
+ goto out;
+ radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+ put_unlocked_mapping_entry(mapping, index, entry);
+out:
+ spin_unlock_irq(&mapping->tree_lock);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
+static bool dax_range_is_aligned(struct block_device *bdev,
+ unsigned int offset, unsigned int length)
+{
+ unsigned short sector_size = bdev_logical_block_size(bdev);
+
+ if (!IS_ALIGNED(offset, sector_size))
+ return false;
+ if (!IS_ALIGNED(length, sector_size))
+ return false;
+
+ return true;
+}
+
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+ unsigned int offset, unsigned int length)
+{
+ struct blk_dax_ctl dax = {
+ .sector = sector,
+ .size = PAGE_SIZE,
+ };
+
+ if (dax_range_is_aligned(bdev, offset, length)) {
+ sector_t start_sector = dax.sector + (offset >> 9);
+
+ return blkdev_issue_zeroout(bdev, start_sector,
+ length >> 9, GFP_NOFS, true);
+ } else {
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
+ dax_unmap_atomic(bdev, &dax);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+
/**
* dax_zero_page_range - zero a range within a page of a DAX file
* @inode: The file being truncated
@@ -1093,47 +1200,29 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
* page in a DAX file. This is intended for hole-punch operations. If
* you are truncating a file, the helper function dax_truncate_page() may be
* more convenient.
- *
- * We work in terms of PAGE_CACHE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
*/
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
get_block_t get_block)
{
struct buffer_head bh;
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
int err;
/* Block boundary? Nothing to do */
if (!length)
return 0;
- BUG_ON((offset + length) > PAGE_CACHE_SIZE);
+ BUG_ON((offset + length) > PAGE_SIZE);
memset(&bh, 0, sizeof(bh));
bh.b_bdev = inode->i_sb->s_bdev;
- bh.b_size = PAGE_CACHE_SIZE;
+ bh.b_size = PAGE_SIZE;
err = get_block(inode, index, &bh, 0);
- if (err < 0)
+ if (err < 0 || !buffer_written(&bh))
return err;
- if (buffer_written(&bh)) {
- struct block_device *bdev = bh.b_bdev;
- struct blk_dax_ctl dax = {
- .sector = to_sector(&bh, inode),
- .size = PAGE_CACHE_SIZE,
- };
-
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- clear_pmem(dax.addr + offset, length);
- wmb_pmem();
- dax_unmap_atomic(bdev, &dax);
- }
- return 0;
+ return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+ offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
@@ -1145,16 +1234,10 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
*
* Similar to block_truncate_page(), this function can be called by a
* filesystem when it is truncating a DAX file to handle the partial page.
- *
- * We work in terms of PAGE_CACHE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
*/
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
- unsigned length = PAGE_CACHE_ALIGN(from) - from;
+ unsigned length = PAGE_ALIGN(from) - from;
return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/fs/dcache.c b/fs/dcache.c
index 32ceae3e61129..5c7cc953ac819 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -104,13 +104,22 @@ static unsigned int d_hash_shift __read_mostly;
static struct hlist_bl_head *dentry_hashtable __read_mostly;
-static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
+static inline struct hlist_bl_head *d_hash(unsigned int hash)
+{
+ return dentry_hashtable + (hash >> (32 - d_hash_shift));
+}
+
+#define IN_LOOKUP_SHIFT 10
+static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];
+
+static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
unsigned int hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
- return dentry_hashtable + hash_32(hash, d_hash_shift);
+ return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT);
}
+
/* Statistics gathering. */
struct dentry_stat_t dentry_stat = {
.age_limit = 45,
@@ -215,10 +224,9 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
- const unsigned char *cs;
/*
* Be careful about RCU walk racing with rename:
- * use ACCESS_ONCE to fetch the name pointer.
+ * use 'lockless_dereference' to fetch the name pointer.
*
* NOTE! Even if a rename will mean that the length
* was not loaded atomically, we don't care. The
@@ -232,8 +240,8 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
* early because the data cannot match (there can
* be no NUL in the ct/tcount data)
*/
- cs = ACCESS_ONCE(dentry->d_name.name);
- smp_read_barrier_depends();
+ const unsigned char *cs = lockless_dereference(dentry->d_name.name);
+
return dentry_string_cmp(cs, ct, tcount);
}
@@ -308,60 +316,23 @@ static void dentry_free(struct dentry *dentry)
call_rcu(&dentry->d_u.d_rcu, __d_free);
}
-/**
- * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
- * @dentry: the target dentry
- * After this call, in-progress rcu-walk path lookup will fail. This
- * should be called after unhashing, and after changing d_inode (if
- * the dentry has not already been unhashed).
- */
-static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
-{
- lockdep_assert_held(&dentry->d_lock);
- /* Go through am invalidation barrier */
- write_seqcount_invalidate(&dentry->d_seq);
-}
-
/*
* Release the dentry's inode, using the filesystem
- * d_iput() operation if defined. Dentry has no refcount
- * and is unhashed.
- */
-static void dentry_iput(struct dentry * dentry)
- __releases(dentry->d_lock)
- __releases(dentry->d_inode->i_lock)
-{
- struct inode *inode = dentry->d_inode;
- if (inode) {
- __d_clear_type_and_inode(dentry);
- hlist_del_init(&dentry->d_u.d_alias);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&inode->i_lock);
- if (!inode->i_nlink)
- fsnotify_inoderemove(inode);
- if (dentry->d_op && dentry->d_op->d_iput)
- dentry->d_op->d_iput(dentry, inode);
- else
- iput(inode);
- } else {
- spin_unlock(&dentry->d_lock);
- }
-}
-
-/*
- * Release the dentry's inode, using the filesystem
- * d_iput() operation if defined. dentry remains in-use.
+ * d_iput() operation if defined.
*/
static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_lock)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+ bool hashed = !d_unhashed(dentry);
- raw_write_seqcount_begin(&dentry->d_seq);
+ if (hashed)
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- raw_write_seqcount_end(&dentry->d_seq);
+ if (hashed)
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -477,13 +448,14 @@ void __d_drop(struct dentry *dentry)
if (unlikely(IS_ROOT(dentry)))
b = &dentry->d_sb->s_anon;
else
- b = d_hash(dentry->d_parent, dentry->d_name.hash);
+ b = d_hash(dentry->d_name.hash);
hlist_bl_lock(b);
__hlist_bl_del(&dentry->d_hash);
dentry->d_hash.pprev = NULL;
hlist_bl_unlock(b);
- dentry_rcuwalk_invalidate(dentry);
+ /* After this call, in-progress rcu-walk path lookup will fail. */
+ write_seqcount_invalidate(&dentry->d_seq);
}
}
EXPORT_SYMBOL(__d_drop);
@@ -496,6 +468,44 @@ void d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(d_drop);
+static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+{
+ struct dentry *next;
+ /*
+ * Inform d_walk() and shrink_dentry_list() that we are no longer
+ * attached to the dentry tree
+ */
+ dentry->d_flags |= DCACHE_DENTRY_KILLED;
+ if (unlikely(list_empty(&dentry->d_child)))
+ return;
+ __list_del_entry(&dentry->d_child);
+ /*
+ * Cursors can move around the list of children. While we were
+ * a normal list member, it didn't matter - ->d_child.next would've
+ * been updated. However, from now on it won't be, and things
+ * like d_walk() might end up with a nasty surprise.
+ * Normally d_walk() doesn't care about cursors moving around -
+ * ->d_lock on parent prevents that and since a cursor has no children
+ * of its own, we get through it without ever unlocking the parent.
+ * There is one exception, though - if we ascend from a child that
+ * gets killed as soon as we unlock it, the next sibling is found
+ * using the value left in its ->d_child.next. And if _that_
+ * pointed to a cursor, and cursor got moved (e.g. by lseek())
+ * before d_walk() regains parent->d_lock, we'll end up skipping
+ * everything the cursor had been moved past.
+ *
+ * Solution: make sure that the pointer left behind in ->d_child.next
+ * points to something that won't be moving around. I.e. skip the
+ * cursors.
+ */
+ while (dentry->d_child.next != &parent->d_subdirs) {
+ next = list_entry(dentry->d_child.next, struct dentry, d_child);
+ if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
+ break;
+ dentry->d_child.next = next->d_child.next;
+ }
+}
+
static void __dentry_kill(struct dentry *dentry)
{
struct dentry *parent = NULL;
@@ -521,20 +531,13 @@ static void __dentry_kill(struct dentry *dentry)
}
/* if it was on the hash then remove it */
__d_drop(dentry);
- __list_del_entry(&dentry->d_child);
- /*
- * Inform d_walk() that we are no longer attached to the
- * dentry tree
- */
- dentry->d_flags |= DCACHE_DENTRY_KILLED;
+ dentry_unlist(dentry, parent);
if (parent)
spin_unlock(&parent->d_lock);
- dentry_iput(dentry);
- /*
- * dentry_iput drops the locks, at which point nobody (except
- * transient RCU lookups) can reach this dentry.
- */
- BUG_ON(dentry->d_lockref.count > 0);
+ if (dentry->d_inode)
+ dentry_unlink_inode(dentry);
+ else
+ spin_unlock(&dentry->d_lock);
this_cpu_dec(nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
@@ -578,7 +581,6 @@ static struct dentry *dentry_kill(struct dentry *dentry)
failed:
spin_unlock(&dentry->d_lock);
- cpu_relax();
return dentry; /* try again with same dentry */
}
@@ -752,6 +754,8 @@ void dput(struct dentry *dentry)
return;
repeat:
+ might_sleep();
+
rcu_read_lock();
if (likely(fast_dput(dentry))) {
rcu_read_unlock();
@@ -761,6 +765,8 @@ repeat:
/* Slow case: now with the dentry lock held */
rcu_read_unlock();
+ WARN_ON(d_in_lookup(dentry));
+
/* Unreachable? Get rid of it */
if (unlikely(d_unhashed(dentry)))
goto kill_it;
@@ -783,8 +789,10 @@ repeat:
kill_it:
dentry = dentry_kill(dentry);
- if (dentry)
+ if (dentry) {
+ cond_resched();
goto repeat;
+ }
}
EXPORT_SYMBOL(dput);
@@ -1190,6 +1198,9 @@ resume:
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
+ if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
+ continue;
+
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
ret = enter(data, dentry);
@@ -1546,6 +1557,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
struct dentry *dentry;
char *dname;
+ int err;
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
if (!dentry)
@@ -1558,7 +1570,11 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
* be overwriting an internal NUL character
*/
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
- if (name->len > DNAME_INLINE_LEN-1) {
+ if (unlikely(!name)) {
+ static const struct qstr anon = QSTR_INIT("/", 1);
+ name = &anon;
+ dname = dentry->d_iname;
+ } else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
struct external_name *p = kmalloc(size + name->len,
GFP_KERNEL_ACCOUNT);
@@ -1600,6 +1616,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
INIT_LIST_HEAD(&dentry->d_child);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
+ if (dentry->d_op && dentry->d_op->d_init) {
+ err = dentry->d_op->d_init(dentry);
+ if (err) {
+ if (dname_external(dentry))
+ kfree(external_name(dentry));
+ kmem_cache_free(dentry_cache, dentry);
+ return NULL;
+ }
+ }
+
this_cpu_inc(nr_dentry);
return dentry;
@@ -1619,7 +1645,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
struct dentry *dentry = __d_alloc(parent->d_sb, name);
if (!dentry)
return NULL;
-
+ dentry->d_flags |= DCACHE_RCUACCESS;
spin_lock(&parent->d_lock);
/*
* don't need child lock because it is not subject
@@ -1634,6 +1660,16 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
}
EXPORT_SYMBOL(d_alloc);
+struct dentry *d_alloc_cursor(struct dentry * parent)
+{
+ struct dentry *dentry = __d_alloc(parent->d_sb, NULL);
+ if (dentry) {
+ dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR;
+ dentry->d_parent = dget(parent);
+ }
+ return dentry;
+}
+
/**
* d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
* @sb: the superblock
@@ -1653,8 +1689,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
struct qstr q;
q.name = name;
- q.len = strlen(name);
- q.hash = full_name_hash(q.name, q.len);
+ q.hash_len = hashlen_string(parent, name);
return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);
@@ -1667,7 +1702,7 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
DCACHE_OP_REVALIDATE |
DCACHE_OP_WEAK_REVALIDATE |
DCACHE_OP_DELETE |
- DCACHE_OP_SELECT_INODE));
+ DCACHE_OP_REAL));
dentry->d_op = op;
if (!op)
return;
@@ -1683,8 +1718,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_DELETE;
if (op->d_prune)
dentry->d_flags |= DCACHE_OP_PRUNE;
- if (op->d_select_inode)
- dentry->d_flags |= DCACHE_OP_SELECT_INODE;
+ if (op->d_real)
+ dentry->d_flags |= DCACHE_OP_REAL;
}
EXPORT_SYMBOL(d_set_d_op);
@@ -1743,13 +1778,14 @@ type_determined:
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
unsigned add_flags = d_flags_for_inode(inode);
+ WARN_ON(d_in_lookup(dentry));
spin_lock(&dentry->d_lock);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
- __fsnotify_d_instantiate(dentry);
+ fsnotify_update_flags(dentry);
spin_unlock(&dentry->d_lock);
}
@@ -1772,11 +1808,11 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
{
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
if (inode) {
+ security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
spin_unlock(&inode->i_lock);
}
- security_d_instantiate(entry, inode);
}
EXPORT_SYMBOL(d_instantiate);
@@ -1793,6 +1829,7 @@ int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
{
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+ security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
spin_unlock(&inode->i_lock);
@@ -1801,7 +1838,6 @@ int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
}
__d_instantiate(entry, inode);
spin_unlock(&inode->i_lock);
- security_d_instantiate(entry, inode);
return 0;
}
@@ -1812,9 +1848,7 @@ struct dentry *d_make_root(struct inode *root_inode)
struct dentry *res = NULL;
if (root_inode) {
- static const struct qstr name = QSTR_INIT("/", 1);
-
- res = __d_alloc(root_inode->i_sb, &name);
+ res = __d_alloc(root_inode->i_sb, NULL);
if (res)
d_instantiate(res, root_inode);
else
@@ -1855,7 +1889,6 @@ EXPORT_SYMBOL(d_find_any_alias);
static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
{
- static const struct qstr anonstring = QSTR_INIT("/", 1);
struct dentry *tmp;
struct dentry *res;
unsigned add_flags;
@@ -1869,12 +1902,13 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
if (res)
goto out_iput;
- tmp = __d_alloc(inode->i_sb, &anonstring);
+ tmp = __d_alloc(inode->i_sb, NULL);
if (!tmp) {
res = ERR_PTR(-ENOMEM);
goto out_iput;
}
+ security_d_instantiate(tmp, inode);
spin_lock(&inode->i_lock);
res = __d_find_any_alias(inode);
if (res) {
@@ -1897,13 +1931,10 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
hlist_bl_unlock(&tmp->d_sb->s_anon);
spin_unlock(&tmp->d_lock);
spin_unlock(&inode->i_lock);
- security_d_instantiate(tmp, inode);
return tmp;
out_iput:
- if (res && !IS_ERR(res))
- security_d_instantiate(res, inode);
iput(inode);
return res;
}
@@ -1972,68 +2003,53 @@ EXPORT_SYMBOL(d_obtain_root);
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
struct qstr *name)
{
- struct dentry *found;
- struct dentry *new;
+ struct dentry *found, *res;
/*
* First check if a dentry matching the name already exists,
* if not go ahead and create it now.
*/
found = d_hash_and_lookup(dentry->d_parent, name);
- if (!found) {
- new = d_alloc(dentry->d_parent, name);
- if (!new) {
- found = ERR_PTR(-ENOMEM);
- } else {
- found = d_splice_alias(inode, new);
- if (found) {
- dput(new);
- return found;
- }
- return new;
+ if (found) {
+ iput(inode);
+ return found;
+ }
+ if (d_in_lookup(dentry)) {
+ found = d_alloc_parallel(dentry->d_parent, name,
+ dentry->d_wait);
+ if (IS_ERR(found) || !d_in_lookup(found)) {
+ iput(inode);
+ return found;
}
+ } else {
+ found = d_alloc(dentry->d_parent, name);
+ if (!found) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ res = d_splice_alias(inode, found);
+ if (res) {
+ dput(found);
+ return res;
}
- iput(inode);
return found;
}
EXPORT_SYMBOL(d_add_ci);
-/*
- * Do the slow-case of the dentry name compare.
- *
- * Unlike the dentry_cmp() function, we need to atomically
- * load the name and length information, so that the
- * filesystem can rely on them, and can use the 'name' and
- * 'len' information without worrying about walking off the
- * end of memory etc.
- *
- * Thus the read_seqcount_retry() and the "duplicate" info
- * in arguments (the low-level filesystem should not look
- * at the dentry inode or name contents directly, since
- * rename can change them while we're in RCU mode).
- */
-enum slow_d_compare {
- D_COMP_OK,
- D_COMP_NOMATCH,
- D_COMP_SEQRETRY,
-};
-static noinline enum slow_d_compare slow_dentry_cmp(
- const struct dentry *parent,
- struct dentry *dentry,
- unsigned int seq,
- const struct qstr *name)
+static inline bool d_same_name(const struct dentry *dentry,
+ const struct dentry *parent,
+ const struct qstr *name)
{
- int tlen = dentry->d_name.len;
- const char *tname = dentry->d_name.name;
-
- if (read_seqcount_retry(&dentry->d_seq, seq)) {
- cpu_relax();
- return D_COMP_SEQRETRY;
+ if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
+ if (dentry->d_name.len != name->len)
+ return false;
+ return dentry_cmp(dentry, name->name, name->len) == 0;
}
- if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
- return D_COMP_NOMATCH;
- return D_COMP_OK;
+ return parent->d_op->d_compare(dentry,
+ dentry->d_name.len, dentry->d_name.name,
+ name) == 0;
}
/**
@@ -2071,7 +2087,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
{
u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen));
+ struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -2112,6 +2128,9 @@ seqretry:
* dentry compare, we will do seqretries until it is stable,
* and if we end up with a successful lookup, we actually
* want to exit RCU lookup anyway.
+ *
+ * Note that raw_seqcount_begin still *does* smp_rmb(), so
+ * we are still guaranteed NUL-termination of ->d_name.name.
*/
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
@@ -2120,24 +2139,28 @@ seqretry:
continue;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
+ int tlen;
+ const char *tname;
if (dentry->d_name.hash != hashlen_hash(hashlen))
continue;
- *seqp = seq;
- switch (slow_dentry_cmp(parent, dentry, seq, name)) {
- case D_COMP_OK:
- return dentry;
- case D_COMP_NOMATCH:
- continue;
- default:
+ tlen = dentry->d_name.len;
+ tname = dentry->d_name.name;
+ /* we want a consistent (name,len) pair */
+ if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ cpu_relax();
goto seqretry;
}
+ if (parent->d_op->d_compare(dentry,
+ tlen, tname, name) != 0)
+ continue;
+ } else {
+ if (dentry->d_name.hash_len != hashlen)
+ continue;
+ if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ continue;
}
-
- if (dentry->d_name.hash_len != hashlen)
- continue;
*seqp = seq;
- if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
- return dentry;
+ return dentry;
}
return NULL;
}
@@ -2185,10 +2208,8 @@ EXPORT_SYMBOL(d_lookup);
*/
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
- unsigned int len = name->len;
unsigned int hash = name->hash;
- const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(parent, hash);
+ struct hlist_bl_head *b = d_hash(hash);
struct hlist_bl_node *node;
struct dentry *found = NULL;
struct dentry *dentry;
@@ -2226,21 +2247,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
if (d_unhashed(dentry))
goto next;
- /*
- * It is safe to compare names since d_move() cannot
- * change the qstr (protected by d_lock).
- */
- if (parent->d_flags & DCACHE_OP_COMPARE) {
- int tlen = dentry->d_name.len;
- const char *tname = dentry->d_name.name;
- if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
- goto next;
- } else {
- if (dentry->d_name.len != len)
- goto next;
- if (dentry_cmp(dentry, str, len))
- goto next;
- }
+ if (!d_same_name(dentry, parent, name))
+ goto next;
dentry->d_lockref.count++;
found = dentry;
@@ -2268,7 +2276,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
* calculate the standard hash first, as the d_op->d_hash()
* routine may choose to leave the hash value unchanged.
*/
- name->hash = full_name_hash(name->name, name->len);
+ name->hash = full_name_hash(dir, name->name, name->len);
if (dir->d_flags & DCACHE_OP_HASH) {
int err = dir->d_op->d_hash(dir, name);
if (unlikely(err < 0))
@@ -2331,20 +2339,15 @@ again:
}
EXPORT_SYMBOL(d_delete);
-static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
+static void __d_rehash(struct dentry *entry)
{
+ struct hlist_bl_head *b = d_hash(entry->d_name.hash);
BUG_ON(!d_unhashed(entry));
hlist_bl_lock(b);
- entry->d_flags |= DCACHE_RCUACCESS;
hlist_bl_add_head_rcu(&entry->d_hash, b);
hlist_bl_unlock(b);
}
-static void _d_rehash(struct dentry * entry)
-{
- __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
-}
-
/**
* d_rehash - add an entry back to the hash
* @entry: dentry to add to the hash
@@ -2355,22 +2358,186 @@ static void _d_rehash(struct dentry * entry)
void d_rehash(struct dentry * entry)
{
spin_lock(&entry->d_lock);
- _d_rehash(entry);
+ __d_rehash(entry);
spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);
+/*
+ * Spin until @dir->i_dir_seq holds an even value that we manage to
+ * replace with the next (odd) one via cmpxchg().  Returns the even value
+ * that was replaced; callers hand it to end_dir_add() afterwards.
+ */
+static inline unsigned start_dir_add(struct inode *dir)
+{
+	unsigned seq;
+
+	while (1) {
+		seq = dir->i_dir_seq;
+		/* an odd value is never claimed; just try again */
+		if (!(seq & 1) && cmpxchg(&dir->i_dir_seq, seq, seq + 1) == seq)
+			break;
+		cpu_relax();
+	}
+	return seq;
+}
+
+/*
+ * Counterpart of start_dir_add(): store @n + 2 into @dir->i_dir_seq with
+ * release semantics, @n being the value start_dir_add() returned.
+ */
+static inline void end_dir_add(struct inode *dir, unsigned n)
+{
+	unsigned next = n + 2;
+
+	smp_store_release(&dir->i_dir_seq, next);
+}
+
+/*
+ * Sleep (uninterruptibly) on @dentry->d_wait for as long as
+ * d_in_lookup(@dentry) stays true.  The caller holds @dentry->d_lock; it
+ * is dropped around every schedule() and held again on return.
+ */
+static void d_wait_lookup(struct dentry *dentry)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	/* nothing to wait for */
+	if (!d_in_lookup(dentry))
+		return;
+
+	add_wait_queue(dentry->d_wait, &wait);
+	do {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock(&dentry->d_lock);
+		schedule();
+		spin_lock(&dentry->d_lock);
+	} while (d_in_lookup(dentry));
+}
+
+struct dentry *d_alloc_parallel(struct dentry *parent,
+ const struct qstr *name,
+ wait_queue_head_t *wq)
+{
+ unsigned int hash = name->hash;
+ struct hlist_bl_head *b = in_lookup_hash(parent, hash); /* in-lookup chain picked by (parent, hash) */
+ struct hlist_bl_node *node;
+ struct dentry *new = d_alloc(parent, name); /* candidate; dput() whenever an existing dentry is returned instead */
+ struct dentry *dentry;
+ unsigned seq, r_seq, d_seq;
+
+ if (unlikely(!new))
+ return ERR_PTR(-ENOMEM);
+
+retry: /* every failed validation below restarts from here, reusing 'new' */
+ rcu_read_lock();
+ seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1; /* low bit cleared: an odd ->i_dir_seq can never compare equal below */
+ r_seq = read_seqbegin(&rename_lock);
+ dentry = __d_lookup_rcu(parent, name, &d_seq);
+ if (unlikely(dentry)) { /* __d_lookup_rcu() found a match */
+ if (!lockref_get_not_dead(&dentry->d_lockref)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ if (read_seqcount_retry(&dentry->d_seq, d_seq)) { /* ->d_seq moved since __d_lookup_rcu() sampled it */
+ rcu_read_unlock();
+ dput(dentry);
+ goto retry;
+ }
+ rcu_read_unlock();
+ dput(new);
+ return dentry;
+ }
+ if (unlikely(read_seqretry(&rename_lock, r_seq))) { /* rename_lock changed while we were looking */
+ rcu_read_unlock();
+ goto retry;
+ }
+ hlist_bl_lock(b);
+ if (unlikely(parent->d_inode->i_dir_seq != seq)) { /* ->i_dir_seq differs from the (even) value sampled above */
+ hlist_bl_unlock(b);
+ rcu_read_unlock();
+ goto retry;
+ }
+ /*
+ * No changes for the parent since the beginning of d_lookup().
+ * Since all removals from the chain happen with hlist_bl_lock(),
+ * any potential in-lookup matches are going to stay here until
+ * we unlock the chain. All fields are stable in everything
+ * we encounter.
+ */
+ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
+ if (dentry->d_name.hash != hash)
+ continue;
+ if (dentry->d_parent != parent)
+ continue;
+ if (!d_same_name(dentry, parent, name))
+ continue;
+ hlist_bl_unlock(b);
+ /* now we can try to grab a reference */
+ if (!lockref_get_not_dead(&dentry->d_lockref)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+
+ rcu_read_unlock();
+ /*
+ * somebody is likely to be still doing lookup for it;
+ * wait for them to finish
+ */
+ spin_lock(&dentry->d_lock);
+ d_wait_lookup(dentry);
+ /*
+ * it's not in-lookup anymore; in principle we should repeat
+ * everything from dcache lookup, but it's likely to be what
+ * d_lookup() would've found anyway. If it is, just return it;
+ * otherwise we really have to repeat the whole thing.
+ */
+ if (unlikely(dentry->d_name.hash != hash))
+ goto mismatch;
+ if (unlikely(dentry->d_parent != parent))
+ goto mismatch;
+ if (unlikely(d_unhashed(dentry)))
+ goto mismatch;
+ if (unlikely(!d_same_name(dentry, parent, name)))
+ goto mismatch;
+ /* OK, it *is* a hashed match; return it */
+ spin_unlock(&dentry->d_lock);
+ dput(new);
+ return dentry;
+ }
+ rcu_read_unlock();
+ /* we can't take ->d_lock here; it's OK, though. */
+ new->d_flags |= DCACHE_PAR_LOOKUP;
+ new->d_wait = wq;
+ hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); /* put 'new' on the in-lookup chain while it is still locked */
+ hlist_bl_unlock(b);
+ return new; /* returned with DCACHE_PAR_LOOKUP set and ->d_wait == wq */
+mismatch: /* reached with dentry->d_lock held and a reference on dentry */
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ goto retry;
+}
+EXPORT_SYMBOL(d_alloc_parallel);
+
+/*
+ * Take @dentry out of the in-lookup state.  With its in-lookup hash chain
+ * locked: clear DCACHE_PAR_LOOKUP, unlink it from the chain, wake everyone
+ * sleeping on ->d_wait and reset ->d_wait to NULL.  After the chain is
+ * unlocked, d_u.d_alias and ->d_lru are reinitialised.
+ */
+void __d_lookup_done(struct dentry *dentry)
+{
+	struct hlist_bl_head *chain;
+
+	chain = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
+
+	hlist_bl_lock(chain);
+	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
+	__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
+	wake_up_all(dentry->d_wait);
+	dentry->d_wait = NULL;
+	hlist_bl_unlock(chain);
+
+	INIT_HLIST_NODE(&dentry->d_u.d_alias);
+	INIT_LIST_HEAD(&dentry->d_lru);
+}
+EXPORT_SYMBOL(__d_lookup_done);
/* inode->i_lock held if inode is non-NULL */
static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
+ struct inode *dir = NULL;
+ unsigned n;
+ spin_lock(&dentry->d_lock);
+ if (unlikely(d_in_lookup(dentry))) {
+ dir = dentry->d_parent->d_inode;
+ n = start_dir_add(dir);
+ __d_lookup_done(dentry);
+ }
if (inode) {
- __d_instantiate(dentry, inode);
+ unsigned add_flags = d_flags_for_inode(inode);
+ hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ raw_write_seqcount_begin(&dentry->d_seq);
+ __d_set_inode_and_type(dentry, inode, add_flags);
+ raw_write_seqcount_end(&dentry->d_seq);
+ fsnotify_update_flags(dentry);
+ }
+ __d_rehash(dentry);
+ if (dir)
+ end_dir_add(dir, n);
+ spin_unlock(&dentry->d_lock);
+ if (inode)
spin_unlock(&inode->i_lock);
- }
- security_d_instantiate(dentry, inode);
- d_rehash(dentry);
}
/**
@@ -2384,8 +2551,10 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
void d_add(struct dentry *entry, struct inode *inode)
{
- if (inode)
+ if (inode) {
+ security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
+ }
__d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);
@@ -2404,8 +2573,6 @@ EXPORT_SYMBOL(d_add);
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
struct dentry *alias;
- int len = entry->d_name.len;
- const char *name = entry->d_name.name;
unsigned int hash = entry->d_name.hash;
spin_lock(&inode->i_lock);
@@ -2419,9 +2586,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
continue;
if (alias->d_parent != entry->d_parent)
continue;
- if (alias->d_name.len != len)
- continue;
- if (dentry_cmp(alias, name, len))
+ if (!d_same_name(alias, entry->d_parent, &entry->d_name))
continue;
spin_lock(&alias->d_lock);
if (!d_unhashed(alias)) {
@@ -2429,7 +2594,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
alias = NULL;
} else {
__dget_dlock(alias);
- _d_rehash(alias);
+ __d_rehash(alias);
spin_unlock(&alias->d_lock);
}
spin_unlock(&inode->i_lock);
@@ -2454,7 +2619,7 @@ EXPORT_SYMBOL(d_exact_alias);
* Parent inode i_mutex must be held over d_lookup and into this call (to
* keep renames and concurrent inserts, and readdir(2) away).
*/
-void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
+void dentry_update_name_case(struct dentry *dentry, const struct qstr *name)
{
BUG_ON(!inode_is_locked(dentry->d_parent->d_inode));
BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
@@ -2595,6 +2760,8 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
static void __d_move(struct dentry *dentry, struct dentry *target,
bool exchange)
{
+ struct inode *dir = NULL;
+ unsigned n;
if (!dentry->d_inode)
printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2602,28 +2769,19 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
BUG_ON(d_ancestor(target, dentry));
dentry_lock_for_move(dentry, target);
+ if (unlikely(d_in_lookup(target))) {
+ dir = target->d_parent->d_inode;
+ n = start_dir_add(dir);
+ __d_lookup_done(target);
+ }
write_seqcount_begin(&dentry->d_seq);
write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
+ /* unhash both */
/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
-
- /*
- * Move the dentry to the target hash queue. Don't bother checking
- * for the same hash queue because of how unlikely it is.
- */
__d_drop(dentry);
- __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
-
- /*
- * Unhash the target (d_delete() is not usable here). If exchanging
- * the two dentries, then rehash onto the other's hash queue.
- */
__d_drop(target);
- if (exchange) {
- __d_rehash(target,
- d_hash(dentry->d_parent, dentry->d_name.hash));
- }
/* Switch the names.. */
if (exchange)
@@ -2631,9 +2789,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
else
copy_name(dentry, target);
+ /* rehash in new place(s) */
+ __d_rehash(dentry);
+ if (exchange)
+ __d_rehash(target);
+
/* ... and switch them in the tree */
if (IS_ROOT(dentry)) {
/* splicing a tree */
+ dentry->d_flags |= DCACHE_RCUACCESS;
dentry->d_parent = target->d_parent;
target->d_parent = target;
list_del_init(&target->d_child);
@@ -2644,13 +2808,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
list_move(&target->d_child, &target->d_parent->d_subdirs);
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
if (exchange)
- fsnotify_d_move(target);
- fsnotify_d_move(dentry);
+ fsnotify_update_flags(target);
+ fsnotify_update_flags(dentry);
}
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
+ if (dir)
+ end_dir_add(dir, n);
dentry_unlock_for_move(dentry, target);
}
@@ -2721,7 +2887,8 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
static int __d_unalias(struct inode *inode,
struct dentry *dentry, struct dentry *alias)
{
- struct mutex *m1 = NULL, *m2 = NULL;
+ struct mutex *m1 = NULL;
+ struct rw_semaphore *m2 = NULL;
int ret = -ESTALE;
/* If alias and dentry share a parent, then no extra locks required */
@@ -2732,15 +2899,15 @@ static int __d_unalias(struct inode *inode,
if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
goto out_err;
m1 = &dentry->d_sb->s_vfs_rename_mutex;
- if (!inode_trylock(alias->d_parent->d_inode))
+ if (!inode_trylock_shared(alias->d_parent->d_inode))
goto out_err;
- m2 = &alias->d_parent->d_inode->i_mutex;
+ m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
__d_move(alias, dentry, false);
ret = 0;
out_err:
if (m2)
- mutex_unlock(m2);
+ up_read(m2);
if (m1)
mutex_unlock(m1);
return ret;
@@ -2779,6 +2946,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
if (!inode)
goto out;
+ security_d_instantiate(dentry, inode);
spin_lock(&inode->i_lock);
if (S_ISDIR(inode->i_mode)) {
struct dentry *new = __d_find_any_alias(inode);
@@ -2806,7 +2974,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
} else {
__d_move(new, dentry, false);
write_sequnlock(&rename_lock);
- security_d_instantiate(new, inode);
}
iput(inode);
return new;
@@ -2846,7 +3013,7 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
* Data dependency barrier is needed to make sure that we see that terminating
* NUL. Alpha strikes again, film at 11...
*/
-static int prepend_name(char **buffer, int *buflen, struct qstr *name)
+static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
{
const char *dname = ACCESS_ONCE(name->name);
u32 dlen = ACCESS_ONCE(name->len);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index d2ba12e23ed94..592059f88e04f 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -22,6 +22,12 @@
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/device.h>
+#include <linux/srcu.h>
+#include <asm/poll.h>
+
+#include "internal.h"
+
+struct poll_table_struct;
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -35,27 +41,294 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
return count;
}
-const struct file_operations debugfs_file_operations = {
+const struct file_operations debugfs_noop_file_operations = {
.read = default_read_file,
.write = default_write_file,
.open = simple_open,
.llseek = noop_llseek,
};
-static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
- struct dentry *parent, void *value,
- const struct file_operations *fops,
- const struct file_operations *fops_ro,
- const struct file_operations *fops_wo)
+/**
+ * debugfs_use_file_start - mark the beginning of file data access
+ * @dentry: the dentry object whose data is being accessed.
+ * @srcu_idx: a pointer to some memory to store a SRCU index in.
+ *
+ * Up to a matching call to debugfs_use_file_finish(), any
+ * subsequent call into the file removing functions debugfs_remove()
+ * and debugfs_remove_recursive() will block. Since associated private
+ * file data may only get freed after a successful return of any of
+ * the removal functions, you may safely access it after a successful
+ * call to debugfs_use_file_start() without worrying about
+ * lifetime issues.
+ *
+ * If -%EIO is returned, the file has already been removed and thus,
+ * it is not safe to access any of its data. If, on the other hand,
+ * it is allowed to access the file data, zero is returned.
+ *
+ * Regardless of the return code, any call to
+ * debugfs_use_file_start() must be followed by a matching call
+ * to debugfs_use_file_finish().
+ */
+int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
+	__acquires(&debugfs_srcu)
+{
+	*srcu_idx = srcu_read_lock(&debugfs_srcu);
+	barrier();
+
+	/* the SRCU read lock stays held even when -EIO is reported */
+	return d_unlinked(dentry) ? -EIO : 0;
+}
+EXPORT_SYMBOL_GPL(debugfs_use_file_start);
+
+/**
+ * debugfs_use_file_finish - mark the end of file data access
+ * @srcu_idx: the SRCU index "created" by a former call to
+ * debugfs_use_file_start().
+ *
+ * Allow any ongoing concurrent call into debugfs_remove() or
+ * debugfs_remove_recursive() blocked by a former call to
+ * debugfs_use_file_start() to proceed and return to its caller.
+ */
+void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu)
+{
+ srcu_read_unlock(&debugfs_srcu, srcu_idx); /* drops the read lock taken in debugfs_use_file_start() */
+}
+EXPORT_SYMBOL_GPL(debugfs_use_file_finish);
+
+#define F_DENTRY(filp) ((filp)->f_path.dentry) /* dentry behind an open struct file */
+
+#define REAL_FOPS_DEREF(dentry) \
+ ((const struct file_operations *)(dentry)->d_fsdata) /* ->d_fsdata read as the owner's file_operations */
+
+static int open_proxy_open(struct inode *inode, struct file *filp)
+{
+ const struct dentry *dentry = F_DENTRY(filp);
+ const struct file_operations *real_fops = NULL;
+ int srcu_idx, r;
+
+ r = debugfs_use_file_start(dentry, &srcu_idx);
+ if (r) {
+ r = -ENOENT; /* start failed (dentry unlinked): report -ENOENT rather than -EIO */
+ goto out;
+ }
+
+ real_fops = REAL_FOPS_DEREF(dentry);
+ real_fops = fops_get(real_fops);
+ if (!real_fops) {
+ /* Huh? Module did not clean up after itself at exit? */
+ WARN(1, "debugfs file owner did not clean up at exit: %pd",
+ dentry);
+ r = -ENXIO;
+ goto out;
+ }
+ replace_fops(filp, real_fops); /* from here on the file uses the owner's fops directly */
+
+ if (real_fops->open)
+ r = real_fops->open(inode, filp); /* otherwise r stays 0 from the successful start */
+
+out: /* the finish call is made on every path, including a failed start */
+ debugfs_use_file_finish(srcu_idx);
+ return r;
+}
+
+const struct file_operations debugfs_open_proxy_file_operations = {
+ .open = open_proxy_open, /* only ->open is set; open_proxy_open() installs the real fops with replace_fops() */
+};
+
+#define PROTO(args...) args
+#define ARGS(args...) args
+
+/*
+ * Generate full_proxy_<name>(): a wrapper that brackets the owner's
+ * ->name() handler with debugfs_use_file_start()/_finish().  When the
+ * start call fails its error code is returned and the handler is not
+ * invoked; the finish call is made in either case.
+ */
+#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args)		\
+static ret_type full_proxy_ ## name(proto)				\
+{									\
+	const struct dentry *dentry = F_DENTRY(filp);			\
+	const struct file_operations *real_fops =			\
+		REAL_FOPS_DEREF(dentry);				\
+	int srcu_idx;							\
+	ret_type r = debugfs_use_file_start(dentry, &srcu_idx);	\
+									\
+	if (likely(!r))							\
+		r = real_fops->name(args);				\
+	debugfs_use_file_finish(srcu_idx);				\
+	return r;							\
+}
+
+FULL_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence));
+
+FULL_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
+
+FULL_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
+
+FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
+ PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
+ ARGS(filp, cmd, arg));
+
+/*
+ * ->poll() proxy: reports POLLHUP when debugfs_use_file_start() fails,
+ * otherwise forwards to the owner's ->poll().  The matching
+ * debugfs_use_file_finish() is called in both cases.
+ */
+static unsigned int full_proxy_poll(struct file *filp,
+				struct poll_table_struct *wait)
+{
+	const struct dentry *dentry = F_DENTRY(filp);
+	const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+	unsigned int mask;
+	int srcu_idx;
+
+	if (debugfs_use_file_start(dentry, &srcu_idx))
+		mask = POLLHUP;
+	else
+		mask = real_fops->poll(filp, wait);
+
+	debugfs_use_file_finish(srcu_idx);
+	return mask;
+}
+
+/*
+ * ->release() proxy installed by __full_proxy_fops_init().
+ *
+ * Calls the owner's ->release() if there is one, puts the inode's ->i_fop
+ * back on @filp, frees the per-open proxy fops and drops the reference on
+ * the real fops.  Returns whatever the owner's ->release() returned, or 0
+ * when the owner has none.
+ */
+static int full_proxy_release(struct inode *inode, struct file *filp)
+{
+	const struct dentry *dentry = F_DENTRY(filp);
+	const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+	const struct file_operations *proxy_fops = filp->f_op;
+	int r = 0;
+
+	/*
+	 * We must not protect this against removal races here: the
+	 * original releaser should be called unconditionally in order
+	 * not to leak any resources. Releasers must not assume that
+	 * ->i_private is still being meaningful here.
+	 */
+	if (real_fops->release)
+		r = real_fops->release(inode, filp);
+
+	replace_fops(filp, d_inode(dentry)->i_fop);
+	/* filp->f_op was the proxy allocated at open time; cast drops const */
+	kfree((void *)proxy_fops);
+	fops_put(real_fops);
+	/* propagate the releaser's result instead of discarding it */
+	return r;
+}
+
+static void __full_proxy_fops_init(struct file_operations *proxy_fops,
+ const struct file_operations *real_fops)
+{
+ proxy_fops->release = full_proxy_release;
+ if (real_fops->llseek)
+ proxy_fops->llseek = full_proxy_llseek;
+ if (real_fops->read)
+ proxy_fops->read = full_proxy_read;
+ if (real_fops->write)
+ proxy_fops->write = full_proxy_write;
+ if (real_fops->poll)
+ proxy_fops->poll = full_proxy_poll;
+ if (real_fops->unlocked_ioctl)
+ proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
+}
+
+static int full_proxy_open(struct inode *inode, struct file *filp)
+{
+ const struct dentry *dentry = F_DENTRY(filp);
+ const struct file_operations *real_fops = NULL;
+ struct file_operations *proxy_fops = NULL;
+ int srcu_idx, r;
+
+ r = debugfs_use_file_start(dentry, &srcu_idx);
+ if (r) {
+ r = -ENOENT; /* start failed (dentry unlinked): report -ENOENT rather than -EIO */
+ goto out;
+ }
+
+ real_fops = REAL_FOPS_DEREF(dentry);
+ real_fops = fops_get(real_fops);
+ if (!real_fops) {
+ /* Huh? Module did not clean up after itself at exit? */
+ WARN(1, "debugfs file owner did not clean up at exit: %pd",
+ dentry);
+ r = -ENXIO;
+ goto out;
+ }
+
+ proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); /* one zeroed proxy per open; freed in full_proxy_release() */
+ if (!proxy_fops) {
+ r = -ENOMEM;
+ goto free_proxy; /* kfree(NULL) is a no-op; this path still drops the real_fops reference */
+ }
+ __full_proxy_fops_init(proxy_fops, real_fops);
+ replace_fops(filp, proxy_fops); /* install the proxy before calling the real ->open() */
+
+ if (real_fops->open) {
+ r = real_fops->open(inode, filp);
+ if (r) {
+ replace_fops(filp, d_inode(dentry)->i_fop); /* undo: put the inode's fops back */
+ goto free_proxy;
+ } else if (filp->f_op != proxy_fops) {
+ /* No protection against file removal anymore. */
+ WARN(1, "debugfs file owner replaced proxy fops: %pd",
+ dentry);
+ goto free_proxy; /* r == 0 here: the open succeeds, only the proxy is discarded */
+ }
+ }
+
+ goto out; /* success: proxy_fops stays installed on filp */
+free_proxy:
+ kfree(proxy_fops);
+ fops_put(real_fops);
+out: /* the finish call is made on every path, including a failed start */
+ debugfs_use_file_finish(srcu_idx);
+ return r;
+}
+
+const struct file_operations debugfs_full_proxy_file_operations = {
+ .open = full_proxy_open, /* only ->open is set; full_proxy_open() installs a per-open proxy fops */
+};
+
+/*
+ * simple_attr_read() bracketed by debugfs_use_file_start()/_finish().
+ * If the start call fails its error is returned and simple_attr_read()
+ * is not called.
+ */
+ssize_t debugfs_attr_read(struct file *file, char __user *buf,
+			size_t len, loff_t *ppos)
+{
+	int srcu_idx;
+	ssize_t ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+
+	if (likely(!ret))
+		ret = simple_attr_read(file, buf, len, ppos);
+
+	debugfs_use_file_finish(srcu_idx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_read);
+
+/*
+ * simple_attr_write() bracketed by debugfs_use_file_start()/_finish().
+ * If the start call fails its error is returned and simple_attr_write()
+ * is not called.
+ */
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+			 size_t len, loff_t *ppos)
+{
+	int srcu_idx;
+	ssize_t ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+
+	if (likely(!ret))
+		ret = simple_attr_write(file, buf, len, ppos);
+
+	debugfs_use_file_finish(srcu_idx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write);
+
+static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
+ struct dentry *parent, void *value,
+ const struct file_operations *fops,
+ const struct file_operations *fops_ro,
+ const struct file_operations *fops_wo)
{
/* if there are no write bits set, make read only */
if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, fops_ro);
+ return debugfs_create_file_unsafe(name, mode, parent, value,
+ fops_ro);
/* if there are no read bits set, make write only */
if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, fops_wo);
+ return debugfs_create_file_unsafe(name, mode, parent, value,
+ fops_wo);
- return debugfs_create_file(name, mode, parent, value, fops);
+ return debugfs_create_file_unsafe(name, mode, parent, value, fops);
}
static int debugfs_u8_set(void *data, u64 val)
@@ -68,9 +341,9 @@ static int debugfs_u8_get(void *data, u64 *val)
*val = *(u8 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
/**
* debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
@@ -99,7 +372,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8,
&fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -114,9 +387,9 @@ static int debugfs_u16_get(void *data, u64 *val)
*val = *(u16 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
/**
* debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
@@ -145,7 +418,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16,
&fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -160,9 +433,9 @@ static int debugfs_u32_get(void *data, u64 *val)
*val = *(u32 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
/**
* debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
@@ -191,7 +464,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32,
&fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -207,9 +480,9 @@ static int debugfs_u64_get(void *data, u64 *val)
*val = *(u64 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
/**
* debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
@@ -238,7 +511,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64,
&fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
@@ -254,9 +527,10 @@ static int debugfs_ulong_get(void *data, u64 *val)
*val = *(unsigned long *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set,
+ "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
/**
* debugfs_create_ulong - create a debugfs file that is used to read and write
@@ -286,26 +560,30 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
struct dentry *parent, unsigned long *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
- &fops_ulong_ro, &fops_ulong_wo);
+ return debugfs_create_mode_unsafe(name, mode, parent, value,
+ &fops_ulong, &fops_ulong_ro,
+ &fops_ulong_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_ulong);
-DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set,
+ "0x%04llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set,
+ "0x%08llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set,
+ "0x%016llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
/*
* debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -328,7 +606,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8,
&fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -346,7 +624,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16,
&fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -364,7 +642,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32,
&fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -382,7 +660,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64,
&fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -398,10 +676,10 @@ static int debugfs_size_t_get(void *data, u64 *val)
*val = *(size_t *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
- "%llu\n"); /* %llu and %zu are more or less the same */
-DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
+ "%llu\n"); /* %llu and %zu are more or less the same */
+DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
/**
* debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -416,8 +694,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
- &fops_size_t_ro, &fops_size_t_wo);
+ return debugfs_create_mode_unsafe(name, mode, parent, value,
+ &fops_size_t, &fops_size_t_ro,
+ &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -431,10 +710,12 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
*val = atomic_read((atomic_t *)data);
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
debugfs_atomic_t_set, "%lld\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
-DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
+ "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
+ "%lld\n");
/**
* debugfs_create_atomic_t - create a debugfs file that is used to read and
@@ -450,8 +731,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
- &fops_atomic_t_ro, &fops_atomic_t_wo);
+ return debugfs_create_mode_unsafe(name, mode, parent, value,
+ &fops_atomic_t, &fops_atomic_t_ro,
+ &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
@@ -459,9 +741,17 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[3];
- bool *val = file->private_data;
+ bool val;
+ int r, srcu_idx;
- if (*val)
+ r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!r))
+ val = *(bool *)file->private_data;
+ debugfs_use_file_finish(srcu_idx);
+ if (r)
+ return r;
+
+ if (val)
buf[0] = 'Y';
else
buf[0] = 'N';
@@ -477,6 +767,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
char buf[32];
size_t buf_size;
bool bv;
+ int r, srcu_idx;
bool *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
@@ -484,8 +775,14 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
return -EFAULT;
buf[buf_size] = '\0';
- if (strtobool(buf, &bv) == 0)
- *val = bv;
+ if (strtobool(buf, &bv) == 0) {
+ r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!r))
+ *val = bv;
+ debugfs_use_file_finish(srcu_idx);
+ if (r)
+ return r;
+ }
return count;
}
@@ -537,7 +834,7 @@ static const struct file_operations fops_bool_wo = {
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
struct dentry *parent, bool *value)
{
- return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+ return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool,
&fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
@@ -546,8 +843,15 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
struct debugfs_blob_wrapper *blob = file->private_data;
- return simple_read_from_buffer(user_buf, count, ppos, blob->data,
- blob->size);
+ ssize_t r;
+ int srcu_idx;
+
+ r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
+ if (likely(!r))
+ r = simple_read_from_buffer(user_buf, count, ppos, blob->data,
+ blob->size);
+ debugfs_use_file_finish(srcu_idx);
+ return r;
}
static const struct file_operations fops_blob = {
@@ -584,7 +888,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
{
- return debugfs_create_file(name, mode, parent, blob, &fops_blob);
+ return debugfs_create_file_unsafe(name, mode, parent, blob, &fops_blob);
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
@@ -689,7 +993,8 @@ struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
data->array = array;
data->elements = elements;
- return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+ return debugfs_create_file_unsafe(name, mode, parent, data,
+ &u32_array_fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index bece948b363df..72361baf9da71 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,9 +27,14 @@
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/srcu.h>
+
+#include "internal.h"
#define DEBUGFS_DEFAULT_MODE 0700
+DEFINE_SRCU(debugfs_srcu);
+
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
@@ -39,7 +44,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime =
+ inode->i_ctime = current_fs_time(sb);
}
return inode;
}
@@ -294,6 +300,37 @@ static struct dentry *end_creating(struct dentry *dentry)
return dentry;
}
+static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *proxy_fops,
+ const struct file_operations *real_fops)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+ BUG_ON(!S_ISREG(mode));
+ dentry = start_creating(name, parent);
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = mode;
+ inode->i_private = data;
+
+ inode->i_fop = proxy_fops;
+ dentry->d_fsdata = (void *)real_fops;
+
+ d_instantiate(dentry, inode);
+ fsnotify_create(d_inode(dentry->d_parent), dentry);
+ return end_creating(dentry);
+}
+
/**
* debugfs_create_file - create a file in the debugfs filesystem
* @name: a pointer to a string containing the name of the file to create.
@@ -324,29 +361,52 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
- struct dentry *dentry;
- struct inode *inode;
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
- BUG_ON(!S_ISREG(mode));
- dentry = start_creating(name, parent);
-
- if (IS_ERR(dentry))
- return NULL;
+ return __debugfs_create_file(name, mode, parent, data,
+ fops ? &debugfs_full_proxy_file_operations :
+ &debugfs_noop_file_operations,
+ fops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file);
- inode = debugfs_get_inode(dentry->d_sb);
- if (unlikely(!inode))
- return failed_creating(dentry);
+/**
+ * debugfs_create_file_unsafe - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ * on. The inode.i_private pointer will point to this value on
+ * the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ * this file.
+ *
+ * debugfs_create_file_unsafe() is completely analogous to
+ * debugfs_create_file(), the only difference being that the fops
+ * handed it will not get protected against file removals by the
+ * debugfs core.
+ *
+ * It is your responsibility to protect your struct file_operation
+ * methods against file removals by means of debugfs_use_file_start()
+ * and debugfs_use_file_finish(). ->open() is still protected by
+ * debugfs though.
+ *
+ * Any struct file_operations defined by means of
+ * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and
+ * thus, may be used here.
+ */
+struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops)
+{
- inode->i_mode = mode;
- inode->i_fop = fops ? fops : &debugfs_file_operations;
- inode->i_private = data;
- d_instantiate(dentry, inode);
- fsnotify_create(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return __debugfs_create_file(name, mode, parent, data,
+ fops ? &debugfs_open_proxy_file_operations :
+ &debugfs_noop_file_operations,
+ fops);
}
-EXPORT_SYMBOL_GPL(debugfs_create_file);
+EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
/**
* debugfs_create_file_size - create a file in the debugfs filesystem
@@ -457,11 +517,15 @@ struct dentry *debugfs_create_automount(const char *name,
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ make_empty_dir_inode(inode);
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
dentry->d_fsdata = (void *)f;
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
d_instantiate(dentry, inode);
+ inc_nlink(d_inode(dentry->d_parent));
+ fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
EXPORT_SYMBOL(debugfs_create_automount);
@@ -557,14 +621,13 @@ void debugfs_remove(struct dentry *dentry)
return;
parent = dentry->d_parent;
- if (!parent || d_really_is_negative(parent))
- return;
-
inode_lock(d_inode(parent));
ret = __debugfs_remove(dentry, parent);
inode_unlock(d_inode(parent));
if (!ret)
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+
+ synchronize_srcu(&debugfs_srcu);
}
EXPORT_SYMBOL_GPL(debugfs_remove);
@@ -588,10 +651,6 @@ void debugfs_remove_recursive(struct dentry *dentry)
if (IS_ERR_OR_NULL(dentry))
return;
- parent = dentry->d_parent;
- if (!parent || d_really_is_negative(parent))
- return;
-
parent = dentry;
down:
inode_lock(d_inode(parent));
@@ -642,6 +701,8 @@ void debugfs_remove_recursive(struct dentry *dentry)
if (!__debugfs_remove(child, parent))
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
inode_unlock(d_inode(parent));
+
+ synchronize_srcu(&debugfs_srcu);
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
new file mode 100644
index 0000000000000..bba52634b995b
--- /dev/null
+++ b/fs/debugfs/internal.h
@@ -0,0 +1,26 @@
+/*
+ * internal.h - declarations internal to debugfs
+ *
+ * Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ */
+
+#ifndef _DEBUGFS_INTERNAL_H_
+#define _DEBUGFS_INTERNAL_H_
+
+struct file_operations;
+
+/* declared over in file.c */
+extern const struct file_operations debugfs_noop_file_operations;
+extern const struct file_operations debugfs_open_proxy_file_operations;
+extern const struct file_operations debugfs_full_proxy_file_operations;
+
+struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops);
+
+#endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 655f21f991606..d116453b02766 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = {
static DEFINE_MUTEX(allocated_ptys_lock);
-static struct vfsmount *devpts_mnt;
-
struct pts_mount_opts {
int setuid;
int setgid;
@@ -104,7 +102,7 @@ struct pts_mount_opts {
kgid_t gid;
umode_t mode;
umode_t ptmxmode;
- int newinstance;
+ int reserve;
int max;
};
@@ -117,17 +115,16 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
{Opt_ptmxmode, "ptmxmode=%o"},
{Opt_newinstance, "newinstance"},
{Opt_max, "max=%d"},
-#endif
{Opt_err, NULL}
};
struct pts_fs_info {
struct ida allocated_ptys;
struct pts_mount_opts mount_opts;
+ struct super_block *sb;
struct dentry *ptmx_dentry;
};
@@ -136,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
return sb->s_fs_info;
}
-static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+struct pts_fs_info *devpts_acquire(struct file *filp)
{
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
- if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
- return inode->i_sb;
-#endif
- if (!devpts_mnt)
- return NULL;
- return devpts_mnt->mnt_sb;
+ struct pts_fs_info *result;
+ struct path path;
+ struct super_block *sb;
+ int err;
+
+ path = filp->f_path;
+ path_get(&path);
+
+ /* Has the devpts filesystem already been found? */
+ sb = path.mnt->mnt_sb;
+ if (sb->s_magic != DEVPTS_SUPER_MAGIC) {
+ /* Is a devpts filesystem at "pts" in the same directory? */
+ err = path_pts(&path);
+ if (err) {
+ result = ERR_PTR(err);
+ goto out;
+ }
+
+ /* Is the path the root of a devpts filesystem? */
+ result = ERR_PTR(-ENODEV);
+ sb = path.mnt->mnt_sb;
+ if ((sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+ (path.mnt->mnt_root != sb->s_root))
+ goto out;
+ }
+
+ /*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+ atomic_inc(&sb->s_active);
+ result = DEVPTS_SB(sb);
+
+out:
+ path_put(&path);
+ return result;
+}
+
+void devpts_release(struct pts_fs_info *fsi)
+{
+ deactivate_super(fsi->sb);
}
#define PARSE_MOUNT 0
@@ -153,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
/*
* parse_mount_options():
* Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value. The exception is
- * 'newinstance' option which can only be set/cleared on a mount (i.e.
- * cannot be changed during remount).
+ * specified in @data, set it to its default value.
*
* Note: @data may be NULL (in which case all options are set to default).
*/
@@ -173,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
- /* newinstance makes sense only on initial mount */
+ /* Only allow instances mounted from the initial mount
+ * namespace to tap the reserve pool of ptys.
+ */
if (op == PARSE_MOUNT)
- opts->newinstance = 0;
+ opts->reserve =
+ (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns);
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
@@ -210,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->mode = option & S_IALLUGO;
break;
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
case Opt_ptmxmode:
if (match_octal(&args[0], &option))
return -EINVAL;
opts->ptmxmode = option & S_IALLUGO;
break;
case Opt_newinstance:
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 1;
break;
case Opt_max:
if (match_int(&args[0], &option) ||
@@ -227,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->max = option;
break;
-#endif
default:
pr_err("called with bogus options\n");
return -EINVAL;
@@ -237,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return 0;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int mknod_ptmx(struct super_block *sb)
{
int mode;
@@ -304,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
}
-#else
-static inline void update_ptmx_mode(struct pts_fs_info *fsi)
-{
- return;
-}
-#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
{
@@ -343,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
if (opts->max < NR_UNIX98_PTY_MAX)
seq_printf(seq, ",max=%d", opts->max);
-#endif
return 0;
}
@@ -358,7 +375,7 @@ static const struct super_operations devpts_sops = {
.show_options = devpts_show_options,
};
-static void *new_pts_fs_info(void)
+static void *new_pts_fs_info(struct super_block *sb)
{
struct pts_fs_info *fsi;
@@ -369,6 +386,7 @@ static void *new_pts_fs_info(void)
ida_init(&fsi->allocated_ptys);
fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ fsi->sb = sb;
return fsi;
}
@@ -378,13 +396,14 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
{
struct inode *inode;
+ s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
s->s_time_gran = 1;
- s->s_fs_info = new_pts_fs_info();
+ s->s_fs_info = new_pts_fs_info(s);
if (!s->s_fs_info)
goto fail;
@@ -408,40 +427,11 @@ fail:
return -ENOMEM;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
-{
- if (devpts_mnt)
- return devpts_mnt->mnt_sb == s;
- return 0;
-}
-
/*
* devpts_mount()
*
- * If the '-o newinstance' mount option was specified, mount a new
- * (private) instance of devpts. PTYs created in this instance are
- * independent of the PTYs in other devpts instances.
- *
- * If the '-o newinstance' option was not specified, mount/remount the
- * initial kernel mount of devpts. This type of mount gives the
- * legacy, single-instance semantics.
- *
- * The 'newinstance' option is needed to support multiple namespace
- * semantics in devpts while preserving backward compatibility of the
- * current 'single-namespace' semantics. i.e all mounts of devpts
- * without the 'newinstance' mount option should bind to the initial
- * kernel mount, like mount_single().
- *
- * Mounts with 'newinstance' option create a new, private namespace.
- *
- * NOTE:
- *
- * For single-mount semantics, devpts cannot use mount_single(),
- * because mount_single()/sget() find and use the super-block from
- * the most recent mount of devpts. But that recent mount may be a
- * 'newinstance' mount and mount_single() would pick the newinstance
- * super-block instead of the initial super-block.
+ * Mount a new (private) instance of devpts. PTYs created in this
+ * instance are independent of the PTYs in other devpts instances.
*/
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
@@ -454,18 +444,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
return ERR_PTR(error);
- /* Require newinstance for all user namespace mounts to ensure
- * the mount options are not changed.
- */
- if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
- return ERR_PTR(-EINVAL);
-
- if (opts.newinstance)
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- else
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
- NULL);
-
+ s = sget(fs_type, NULL, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
@@ -489,18 +468,6 @@ out_undo_sget:
return ERR_PTR(error);
}
-#else
-/*
- * This supports only the legacy single-instance semantics (no
- * multiple-instance semantics)
- */
-static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_single(fs_type, flags, data, devpts_fill_super);
-}
-#endif
-
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
@@ -514,9 +481,7 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
- .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
-#endif
+ .fs_flags = FS_USERNS_MOUNT,
};
/*
@@ -524,24 +489,18 @@ static struct file_system_type devpts_fs_type = {
* to the System V naming convention
*/
-int devpts_new_index(struct inode *ptmx_inode)
+int devpts_new_index(struct pts_fs_info *fsi)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi;
int index;
int ida_ret;
- if (!sb)
- return -ENODEV;
-
- fsi = DEVPTS_SB(sb);
retry:
if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
mutex_lock(&allocated_ptys_lock);
- if (pty_count >= pty_limit -
- (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+ if (pty_count >= (pty_limit -
+ (fsi->mount_opts.reserve ? 0 : pty_reserve))) {
mutex_unlock(&allocated_ptys_lock);
return -ENOSPC;
}
@@ -564,37 +523,14 @@ retry:
return index;
}
-void devpts_kill_index(struct inode *ptmx_inode, int idx)
+void devpts_kill_index(struct pts_fs_info *fsi, int idx)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
-
mutex_lock(&allocated_ptys_lock);
ida_remove(&fsi->allocated_ptys, idx);
pty_count--;
mutex_unlock(&allocated_ptys_lock);
}
-/*
- * pty code needs to hold extra references in case of last /dev/tty close
- */
-
-void devpts_add_ref(struct inode *ptmx_inode)
-{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-
- atomic_inc(&sb->s_active);
- ihold(ptmx_inode);
-}
-
-void devpts_del_ref(struct inode *ptmx_inode)
-{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-
- iput(ptmx_inode);
- deactivate_super(sb);
-}
-
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
@@ -604,22 +540,16 @@ void devpts_del_ref(struct inode *ptmx_inode)
*
* The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
*/
-struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
- void *priv)
+struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
{
struct dentry *dentry;
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct super_block *sb = fsi->sb;
struct inode *inode;
struct dentry *root;
- struct pts_fs_info *fsi;
struct pts_mount_opts *opts;
char s[12];
- if (!sb)
- return ERR_PTR(-ENODEV);
-
root = sb->s_root;
- fsi = DEVPTS_SB(sb);
opts = &fsi->mount_opts;
inode = new_inode(sb);
@@ -630,25 +560,21 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- init_special_inode(inode, S_IFCHR|opts->mode, device);
- inode->i_private = priv;
+ init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
sprintf(s, "%d", index);
- inode_lock(d_inode(root));
-
dentry = d_alloc_name(root, s);
if (dentry) {
+ dentry->d_fsdata = priv;
d_add(dentry, inode);
fsnotify_create(d_inode(root), dentry);
} else {
iput(inode);
- inode = ERR_PTR(-ENOMEM);
+ dentry = ERR_PTR(-ENOMEM);
}
- inode_unlock(d_inode(root));
-
- return inode;
+ return dentry;
}
/**
@@ -657,24 +583,10 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
*
* Returns whatever was passed as priv in devpts_pty_new for a given inode.
*/
-void *devpts_get_priv(struct inode *pts_inode)
+void *devpts_get_priv(struct dentry *dentry)
{
- struct dentry *dentry;
- void *priv = NULL;
-
- BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
-
- /* Ensure dentry has not been deleted by devpts_pty_kill() */
- dentry = d_find_alias(pts_inode);
- if (!dentry)
- return NULL;
-
- if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
- priv = pts_inode->i_private;
-
- dput(dentry);
-
- return priv;
+ WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC);
+ return dentry->d_fsdata;
}
/**
@@ -683,43 +595,21 @@ void *devpts_get_priv(struct inode *pts_inode)
*
* This is an inverse operation of devpts_pty_new.
*/
-void devpts_pty_kill(struct inode *inode)
+void devpts_pty_kill(struct dentry *dentry)
{
- struct super_block *sb = pts_sb_from_inode(inode);
- struct dentry *root = sb->s_root;
- struct dentry *dentry;
-
- BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
-
- inode_lock(d_inode(root));
-
- dentry = d_find_alias(inode);
+ WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC);
- drop_nlink(inode);
+ dentry->d_fsdata = NULL;
+ drop_nlink(dentry->d_inode);
d_delete(dentry);
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
- dput(dentry); /* d_find_alias above */
-
- inode_unlock(d_inode(root));
}
static int __init init_devpts_fs(void)
{
int err = register_filesystem(&devpts_fs_type);
- struct ctl_table_header *table;
-
if (!err) {
- struct vfsmount *mnt;
-
- table = register_sysctl_table(pty_root_table);
- mnt = kern_mount(&devpts_fs_type);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- unregister_filesystem(&devpts_fs_type);
- unregister_sysctl_table(table);
- } else {
- devpts_mnt = mnt;
- }
+ register_sysctl_table(pty_root_table);
}
return err;
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 476f1ecbd1f0e..7c3ce73cb6170 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -108,7 +108,8 @@ struct dio_submit {
/* dio_state communicated between submission path and end_io */
struct dio {
int flags; /* doesn't change */
- int rw;
+ int op;
+ int op_flags;
blk_qc_t bio_cookie;
struct block_device *bio_bdev;
struct inode *inode;
@@ -163,7 +164,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
&sdio->from);
- if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
+ if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) {
struct page *page = ZERO_PAGE(0);
/*
* A memory fault, but the filesystem has some outstanding
@@ -172,7 +173,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
*/
if (dio->page_errors == 0)
dio->page_errors = ret;
- page_cache_get(page);
+ get_page(page);
dio->pages[0] = page;
sdio->head = 0;
sdio->tail = 1;
@@ -224,9 +225,9 @@ static inline struct page *dio_get_page(struct dio *dio,
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
- bool is_async)
+static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
{
+ loff_t offset = dio->iocb->ki_pos;
ssize_t transferred = 0;
/*
@@ -242,7 +243,8 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
transferred = dio->result;
/* Check for short read case */
- if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
+ if ((dio->op == REQ_OP_READ) &&
+ ((offset + transferred) > dio->i_size))
transferred = dio->i_size - offset;
}
@@ -256,6 +258,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
if (dio->end_io) {
int err;
+ // XXX: ki_pos??
err = dio->end_io(dio->iocb, offset, ret, dio->private);
if (err)
ret = err;
@@ -265,15 +268,15 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
inode_dio_end(dio->inode);
if (is_async) {
- if (dio->rw & WRITE) {
- int err;
-
- err = generic_write_sync(dio->iocb->ki_filp, offset,
- transferred);
- if (err < 0 && ret > 0)
- ret = err;
- }
+ /*
+ * generic_write_sync expects ki_pos to have been updated
+ * already, but the submission path only does this for
+ * synchronous I/O.
+ */
+ dio->iocb->ki_pos += transferred;
+ if (dio->op == REQ_OP_WRITE)
+ ret = generic_write_sync(dio->iocb, transferred);
dio->iocb->ki_complete(dio->iocb, ret, 0);
}
@@ -285,7 +288,7 @@ static void dio_aio_complete_work(struct work_struct *work)
{
struct dio *dio = container_of(work, struct dio, complete_work);
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ dio_complete(dio, 0, true);
}
static int dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -314,7 +317,7 @@ static void dio_bio_end_aio(struct bio *bio)
queue_work(dio->inode->i_sb->s_dio_done_wq,
&dio->complete_work);
} else {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ dio_complete(dio, 0, true);
}
}
}
@@ -374,6 +377,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = first_sector;
+ bio_set_op_attrs(bio, dio->op, dio->op_flags);
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
else
@@ -401,17 +405,16 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (dio->is_async && dio->rw == READ && dio->should_dirty)
+ if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty)
bio_set_pages_dirty(bio);
dio->bio_bdev = bio->bi_bdev;
if (sdio->submit_io) {
- sdio->submit_io(dio->rw, bio, dio->inode,
- sdio->logical_offset_in_bio);
+ sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
dio->bio_cookie = BLK_QC_T_NONE;
} else
- dio->bio_cookie = submit_bio(dio->rw, bio);
+ dio->bio_cookie = submit_bio(bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -424,7 +427,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
while (sdio->head < sdio->tail)
- page_cache_release(dio->pages[sdio->head++]);
+ put_page(dio->pages[sdio->head++]);
}
/*
@@ -477,17 +480,17 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
if (bio->bi_error)
dio->io_error = -EIO;
- if (dio->is_async && dio->rw == READ && dio->should_dirty) {
+ if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
err = bio->bi_error;
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- if (dio->rw == READ && !PageCompound(page) &&
+ if (dio->op == REQ_OP_READ && !PageCompound(page) &&
dio->should_dirty)
set_page_dirty_lock(page);
- page_cache_release(page);
+ put_page(page);
}
err = bio->bi_error;
bio_put(bio);
@@ -627,20 +630,20 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
map_bh->b_size = fs_count << i_blkbits;
/*
- * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
- * forbid block creations: only overwrites are permitted.
- * We will return early to the caller once we see an
- * unmapped buffer head returned, and the caller will fall
- * back to buffered I/O.
+ * For writes that could fill holes inside i_size on a
+ * DIO_SKIP_HOLES filesystem we forbid block creations: only
+ * overwrites are permitted. We will return early to the caller
+ * once we see an unmapped buffer head returned, and the caller
+ * will fall back to buffered I/O.
*
* Otherwise the decision is left to the get_blocks method,
* which may decide to handle it or also return an unmapped
* buffer head.
*/
- create = dio->rw & WRITE;
+ create = dio->op == REQ_OP_WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
- if (sdio->block_in_file < (i_size_read(dio->inode) >>
- sdio->blkbits))
+ if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
+ i_blkbits))
create = 0;
}
@@ -696,7 +699,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio)
*/
if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
sdio->pages_in_io--;
- page_cache_get(sdio->cur_page);
+ get_page(sdio->cur_page);
sdio->final_block_in_bio = sdio->cur_page_block +
(sdio->cur_page_len >> sdio->blkbits);
ret = 0;
@@ -787,7 +790,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
{
int ret = 0;
- if (dio->rw & WRITE) {
+ if (dio->op == REQ_OP_WRITE) {
/*
* Read accounting is performed in submit_bio()
*/
@@ -810,13 +813,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
*/
if (sdio->cur_page) {
ret = dio_send_cur_page(dio, sdio, map_bh);
- page_cache_release(sdio->cur_page);
+ put_page(sdio->cur_page);
sdio->cur_page = NULL;
if (ret)
return ret;
}
- page_cache_get(page); /* It is in dio */
+ get_page(page); /* It is in dio */
sdio->cur_page = page;
sdio->cur_page_offset = offset;
sdio->cur_page_len = len;
@@ -830,7 +833,7 @@ out:
if (sdio->boundary) {
ret = dio_send_cur_page(dio, sdio, map_bh);
dio_bio_submit(dio, sdio);
- page_cache_release(sdio->cur_page);
+ put_page(sdio->cur_page);
sdio->cur_page = NULL;
}
return ret;
@@ -947,7 +950,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
ret = get_more_blocks(dio, sdio, map_bh);
if (ret) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
if (!buffer_mapped(map_bh))
@@ -987,8 +990,8 @@ do_holes:
loff_t i_size_aligned;
/* AKPM: eargh, -ENOTBLK is a hack */
- if (dio->rw & WRITE) {
- page_cache_release(page);
+ if (dio->op == REQ_OP_WRITE) {
+ put_page(page);
return -ENOTBLK;
}
@@ -1001,7 +1004,7 @@ do_holes:
if (sdio->block_in_file >=
i_size_aligned >> blkbits) {
/* We hit eof */
- page_cache_release(page);
+ put_page(page);
goto out;
}
zero_user(page, from, 1 << blkbits);
@@ -1041,7 +1044,7 @@ do_holes:
sdio->next_block_for_io,
map_bh);
if (ret) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
sdio->next_block_for_io += this_chunk_blocks;
@@ -1057,7 +1060,7 @@ next_block:
}
/* Drop the ref which was taken in get_user_pages() */
- page_cache_release(page);
+ put_page(page);
}
out:
return ret;
@@ -1113,7 +1116,7 @@ static inline int drop_refcount(struct dio *dio)
static inline ssize_t
do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter,
- loff_t offset, get_block_t get_block, dio_iodone_t end_io,
+ get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
@@ -1121,6 +1124,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
size_t count = iov_iter_count(iter);
+ loff_t offset = iocb->ki_pos;
loff_t end = offset + count;
struct dio *dio;
struct dio_submit sdio = { 0, };
@@ -1200,7 +1204,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->is_async = true;
dio->inode = inode;
- dio->rw = iov_iter_rw(iter) == WRITE ? WRITE_ODIRECT : READ;
+ if (iov_iter_rw(iter) == WRITE) {
+ dio->op = REQ_OP_WRITE;
+ dio->op_flags = WRITE_ODIRECT;
+ } else {
+ dio->op = REQ_OP_READ;
+ }
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
@@ -1281,7 +1290,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
if (retval == 0)
retval = ret2;
- page_cache_release(sdio.cur_page);
+ put_page(sdio.cur_page);
sdio.cur_page = NULL;
}
if (sdio.bio)
@@ -1318,7 +1327,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio_await_completion(dio);
if (drop_refcount(dio) == 0) {
- retval = dio_complete(dio, offset, retval, false);
+ retval = dio_complete(dio, retval, false);
} else
BUG_ON(retval != -EIOCBQUEUED);
@@ -1328,7 +1337,7 @@ out:
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter,
- loff_t offset, get_block_t get_block,
+ get_block_t get_block,
dio_iodone_t end_io, dio_submit_t submit_io,
int flags)
{
@@ -1344,7 +1353,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
- return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block,
+ return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
end_io, submit_io, flags);
}
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 519112168a9e2..df955d2209ce9 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -73,6 +73,7 @@ struct dlm_cluster {
unsigned int cl_toss_secs;
unsigned int cl_scan_secs;
unsigned int cl_log_debug;
+ unsigned int cl_log_info;
unsigned int cl_protocol;
unsigned int cl_timewarn_cs;
unsigned int cl_waitwarn_us;
@@ -95,6 +96,7 @@ enum {
CLUSTER_ATTR_TOSS_SECS,
CLUSTER_ATTR_SCAN_SECS,
CLUSTER_ATTR_LOG_DEBUG,
+ CLUSTER_ATTR_LOG_INFO,
CLUSTER_ATTR_PROTOCOL,
CLUSTER_ATTR_TIMEWARN_CS,
CLUSTER_ATTR_WAITWARN_US,
@@ -166,6 +168,7 @@ CLUSTER_ATTR(recover_timer, 1);
CLUSTER_ATTR(toss_secs, 1);
CLUSTER_ATTR(scan_secs, 1);
CLUSTER_ATTR(log_debug, 0);
+CLUSTER_ATTR(log_info, 0);
CLUSTER_ATTR(protocol, 0);
CLUSTER_ATTR(timewarn_cs, 1);
CLUSTER_ATTR(waitwarn_us, 0);
@@ -180,6 +183,7 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs,
[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs,
[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug,
+ [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info,
[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
@@ -343,13 +347,12 @@ static struct config_group *make_cluster(struct config_group *g,
struct dlm_cluster *cl = NULL;
struct dlm_spaces *sps = NULL;
struct dlm_comms *cms = NULL;
- void *gps = NULL;
cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
- if (!cl || !gps || !sps || !cms)
+ if (!cl || !sps || !cms)
goto fail;
config_group_init_type_name(&cl->group, name, &cluster_type);
@@ -366,6 +369,7 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_toss_secs = dlm_config.ci_toss_secs;
cl->cl_scan_secs = dlm_config.ci_scan_secs;
cl->cl_log_debug = dlm_config.ci_log_debug;
+ cl->cl_log_info = dlm_config.ci_log_info;
cl->cl_protocol = dlm_config.ci_protocol;
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
@@ -851,6 +855,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_TOSS_SECS 10
#define DEFAULT_SCAN_SECS 5
#define DEFAULT_LOG_DEBUG 0
+#define DEFAULT_LOG_INFO 1
#define DEFAULT_PROTOCOL 0
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
#define DEFAULT_WAITWARN_US 0
@@ -866,6 +871,7 @@ struct dlm_config_info dlm_config = {
.ci_toss_secs = DEFAULT_TOSS_SECS,
.ci_scan_secs = DEFAULT_SCAN_SECS,
.ci_log_debug = DEFAULT_LOG_DEBUG,
+ .ci_log_info = DEFAULT_LOG_INFO,
.ci_protocol = DEFAULT_PROTOCOL,
.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
.ci_waitwarn_us = DEFAULT_WAITWARN_US,
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index f30697bc2780d..6041eec886abf 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -31,6 +31,7 @@ struct dlm_config_info {
int ci_toss_secs;
int ci_scan_secs;
int ci_log_debug;
+ int ci_log_info;
int ci_protocol;
int ci_timewarn_cs;
int ci_waitwarn_us;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5eff6ea3e27f1..216b61604ef90 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -65,8 +65,16 @@ struct dlm_mhandle;
printk(KERN_ERR "dlm: "fmt"\n" , ##args)
#define log_error(ls, fmt, args...) \
printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+
#define log_rinfo(ls, fmt, args...) \
- printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
+do { \
+ if (dlm_config.ci_log_info) \
+ printk(KERN_INFO "dlm: %s: " fmt "\n", \
+ (ls)->ls_name, ##args); \
+ else if (dlm_config.ci_log_debug) \
+ printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
+ (ls)->ls_name , ##args); \
+} while (0)
#define log_debug(ls, fmt, args...) \
do { \
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 00640e70ed7ae..963016c8f3d12 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -640,7 +640,7 @@ static int receive_from_sock(struct connection *con)
con->rx_page = alloc_page(GFP_ATOMIC);
if (con->rx_page == NULL)
goto out_resched;
- cbuf_init(&con->cb, PAGE_CACHE_SIZE);
+ cbuf_init(&con->cb, PAGE_SIZE);
}
/*
@@ -657,7 +657,7 @@ static int receive_from_sock(struct connection *con)
* buffer and the start of the currently used section (cb.base)
*/
if (cbuf_data(&con->cb) >= con->cb.base) {
- iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
+ iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb);
iov[1].iov_len = con->cb.base;
iov[1].iov_base = page_address(con->rx_page);
nvec = 2;
@@ -675,7 +675,7 @@ static int receive_from_sock(struct connection *con)
ret = dlm_process_incoming_buffer(con->nodeid,
page_address(con->rx_page),
con->cb.base, con->cb.len,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (ret == -EBADMSG) {
log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d",
page_address(con->rx_page), con->cb.base,
@@ -1279,10 +1279,9 @@ static void init_local(void)
if (dlm_our_addr(&sas, i))
break;
- addr = kmalloc(sizeof(*addr), GFP_NOFS);
+ addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
if (!addr)
break;
- memcpy(addr, &sas, sizeof(*addr));
dlm_local_addr[dlm_local_count++] = addr;
}
}
@@ -1416,7 +1415,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
spin_lock(&con->writequeue_lock);
e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
if ((&e->list == &con->writequeue) ||
- (PAGE_CACHE_SIZE - e->end < len)) {
+ (PAGE_SIZE - e->end < len)) {
e = NULL;
} else {
offset = e->end;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 64026e53722a2..e5e29f8c920b1 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -45,7 +45,7 @@
* ecryptfs_to_hex
* @dst: Buffer to take hex character representation of contents of
* src; must be at least of size (src_size * 2)
- * @src: Buffer to be converted to a hex string respresentation
+ * @src: Buffer to be converted to a hex string representation
* @src_size: number of bytes to convert
*/
void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
@@ -60,7 +60,7 @@ void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
* ecryptfs_from_hex
* @dst: Buffer to take the bytes from src hex; must be at least of
* size (src_size / 2)
- * @src: Buffer to be converted from a hex string respresentation to raw value
+ * @src: Buffer to be converted from a hex string representation to raw value
* @dst_size: size of dst buffer, or number of hex characters pairs to convert
*/
void ecryptfs_from_hex(char *dst, char *src, int dst_size)
@@ -105,19 +105,7 @@ static int ecryptfs_calculate_md5(char *dst,
struct crypto_shash *tfm;
int rc = 0;
- mutex_lock(&crypt_stat->cs_hash_tfm_mutex);
tfm = crypt_stat->hash_tfm;
- if (!tfm) {
- tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
- if (IS_ERR(tfm)) {
- rc = PTR_ERR(tfm);
- ecryptfs_printk(KERN_ERR, "Error attempting to "
- "allocate crypto context; rc = [%d]\n",
- rc);
- goto out;
- }
- crypt_stat->hash_tfm = tfm;
- }
rc = ecryptfs_hash_digest(tfm, src, len, dst);
if (rc) {
printk(KERN_ERR
@@ -126,7 +114,6 @@ static int ecryptfs_calculate_md5(char *dst,
goto out;
}
out:
- mutex_unlock(&crypt_stat->cs_hash_tfm_mutex);
return rc;
}
@@ -207,16 +194,29 @@ out:
*
* Initialize the crypt_stat structure.
*/
-void
-ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
{
+ struct crypto_shash *tfm;
+ int rc;
+
+ tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
+ if (IS_ERR(tfm)) {
+ rc = PTR_ERR(tfm);
+ ecryptfs_printk(KERN_ERR, "Error attempting to "
+ "allocate crypto context; rc = [%d]\n",
+ rc);
+ return rc;
+ }
+
memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
INIT_LIST_HEAD(&crypt_stat->keysig_list);
mutex_init(&crypt_stat->keysig_list_mutex);
mutex_init(&crypt_stat->cs_mutex);
mutex_init(&crypt_stat->cs_tfm_mutex);
- mutex_init(&crypt_stat->cs_hash_tfm_mutex);
+ crypt_stat->hash_tfm = tfm;
crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED;
+
+ return 0;
}
/**
@@ -286,7 +286,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
pg = virt_to_page(addr);
offset = offset_in_page(addr);
sg_set_page(&sg[i], pg, 0, offset);
- remainder_of_page = PAGE_CACHE_SIZE - offset;
+ remainder_of_page = PAGE_SIZE - offset;
if (size >= remainder_of_page) {
sg[i].length = remainder_of_page;
addr += remainder_of_page;
@@ -400,7 +400,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
struct page *page)
{
return ecryptfs_lower_header_size(crypt_stat) +
- ((loff_t)page->index << PAGE_CACHE_SHIFT);
+ ((loff_t)page->index << PAGE_SHIFT);
}
/**
@@ -428,7 +428,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
size_t extent_size = crypt_stat->extent_size;
int rc;
- extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
+ extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size));
rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
(extent_base + extent_offset));
if (rc) {
@@ -498,7 +498,7 @@ int ecryptfs_encrypt_page(struct page *page)
}
for (extent_offset = 0;
- extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+ extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
rc = crypt_extent(crypt_stat, enc_extent_page, page,
extent_offset, ENCRYPT);
@@ -512,7 +512,7 @@ int ecryptfs_encrypt_page(struct page *page)
lower_offset = lower_offset_for_page(crypt_stat, page);
enc_extent_virt = kmap(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
kunmap(enc_extent_page);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
@@ -560,7 +560,7 @@ int ecryptfs_decrypt_page(struct page *page)
lower_offset = lower_offset_for_page(crypt_stat, page);
page_virt = kmap(page);
- rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
+ rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
ecryptfs_inode);
kunmap(page);
if (rc < 0) {
@@ -571,7 +571,7 @@ int ecryptfs_decrypt_page(struct page *page)
}
for (extent_offset = 0;
- extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+ extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
rc = crypt_extent(crypt_stat, page, page,
extent_offset, DECRYPT);
@@ -659,11 +659,11 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
else {
- if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
+ if (PAGE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
crypt_stat->metadata_size =
ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
else
- crypt_stat->metadata_size = PAGE_CACHE_SIZE;
+ crypt_stat->metadata_size = PAGE_SIZE;
}
}
@@ -953,7 +953,7 @@ struct ecryptfs_cipher_code_str_map_elem {
};
/* Add support for additional ciphers by adding elements here. The
- * cipher_code is whatever OpenPGP applicatoins use to identify the
+ * cipher_code is whatever OpenPGP applications use to identify the
* ciphers. List in order of probability. */
static struct ecryptfs_cipher_code_str_map_elem
ecryptfs_cipher_code_str_map[] = {
@@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
static int
ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode,
char *page_virt, size_t size)
{
int rc;
- rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt,
- size, 0);
+ rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode,
+ ECRYPTFS_XATTR_NAME, page_virt, size, 0);
return rc;
}
@@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
goto out_free;
}
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
- rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
- size);
+ rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode,
+ virt, size);
else
rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
virt_len);
@@ -1369,7 +1370,9 @@ int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode)
ssize_t size;
int rc = 0;
- size = ecryptfs_getxattr_lower(lower_dentry, ECRYPTFS_XATTR_NAME,
+ size = ecryptfs_getxattr_lower(lower_dentry,
+ ecryptfs_inode_to_lower(ecryptfs_inode),
+ ECRYPTFS_XATTR_NAME,
page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE);
if (size < 0) {
if (unlikely(ecryptfs_verbosity > 0))
@@ -1391,6 +1394,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
int rc;
rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+ ecryptfs_inode_to_lower(inode),
ECRYPTFS_XATTR_NAME, file_size,
ECRYPTFS_SIZE_AND_MARKER_BYTES);
if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
@@ -1406,7 +1410,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
*
* Common entry point for reading file metadata. From here, we could
* retrieve the header information from the header region of the file,
- * the xattr region of the file, or some other repostory that is
+ * the xattr region of the file, or some other repository that is
* stored separately from the file itself. The current implementation
* supports retrieving the metadata information from the file contents
* and from the xattr region.
@@ -1442,7 +1446,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ECRYPTFS_VALIDATE_HEADER_SIZE);
if (rc) {
/* metadata is not in the file header, so try xattrs */
- memset(page_virt, 0, PAGE_CACHE_SIZE);
+ memset(page_virt, 0, PAGE_SIZE);
rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
@@ -1475,7 +1479,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
}
out:
if (page_virt) {
- memset(page_virt, 0, PAGE_CACHE_SIZE);
+ memset(page_virt, 0, PAGE_SIZE);
kmem_cache_free(ecryptfs_header_cache, page_virt);
}
return rc;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index d123fbaa28e00..4ba1547bb9adc 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -242,7 +242,6 @@ struct ecryptfs_crypt_stat {
struct list_head keysig_list;
struct mutex keysig_list_mutex;
struct mutex cs_tfm_mutex;
- struct mutex cs_hash_tfm_mutex;
struct mutex cs_mutex;
};
@@ -577,7 +576,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
int sg_size);
int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_rotate_iv(unsigned char *iv);
-void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_mount_crypt_stat(
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
@@ -607,11 +606,11 @@ ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
unsigned char *src, struct dentry *ecryptfs_dentry);
int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
ssize_t
-ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
- void *value, size_t size);
+ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
+ const char *name, void *value, size_t size);
int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags);
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
#ifdef CONFIG_ECRYPT_FS_MESSAGING
int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index feef8a9c4de7c..ca4e83750214a 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
.sb = inode->i_sb,
};
lower_file = ecryptfs_file_to_lower(file);
- lower_file->f_pos = ctx->pos;
rc = iterate_dir(lower_file, &buf.ctx);
ctx->pos = buf.ctx.pos;
if (rc < 0)
@@ -170,9 +169,22 @@ out:
return rc;
}
+static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct file *lower_file = ecryptfs_file_to_lower(file);
+ /*
+ * Don't allow mmap on top of file systems that don't support it
+ * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs
+ * allows recursive mounting, this will need to be extended.
+ */
+ if (!lower_file->f_op->mmap)
+ return -ENODEV;
+ return generic_file_mmap(file, vma);
+}
+
/**
* ecryptfs_open
- * @inode: inode speciying file to open
+ * @inode: inode specifying file to open
* @file: Structure to return filled in
*
* Opens the file specified by inode.
@@ -223,14 +235,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
}
ecryptfs_set_file_lower(
file, ecryptfs_inode_to_private(inode)->lower_file);
- if (d_is_dir(ecryptfs_dentry)) {
- ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
- mutex_lock(&crypt_stat->cs_mutex);
- crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
- mutex_unlock(&crypt_stat->cs_mutex);
- rc = 0;
- goto out;
- }
rc = read_or_initialize_metadata(ecryptfs_dentry);
if (rc)
goto out_put;
@@ -247,6 +251,45 @@ out:
return rc;
}
+/**
+ * ecryptfs_dir_open
+ * @inode: inode specifying file to open
+ * @file: Structure to return filled in
+ *
+ * Opens the file specified by inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_dir_open(struct inode *inode, struct file *file)
+{
+ struct dentry *ecryptfs_dentry = file->f_path.dentry;
+ /* Private value of ecryptfs_dentry allocated in
+ * ecryptfs_lookup() */
+ struct ecryptfs_file_info *file_info;
+ struct file *lower_file;
+
+ /* Released in ecryptfs_release or end of function if failure */
+ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
+ ecryptfs_set_file_private(file, file_info);
+ if (unlikely(!file_info)) {
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to allocate memory\n");
+ return -ENOMEM;
+ }
+ lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry),
+ file->f_flags, current_cred());
+ if (IS_ERR(lower_file)) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the lower file for the dentry with name "
+ "[%pd]; rc = [%ld]\n", __func__,
+ ecryptfs_dentry, PTR_ERR(lower_file));
+ kmem_cache_free(ecryptfs_file_info_cache, file_info);
+ return PTR_ERR(lower_file);
+ }
+ ecryptfs_set_file_lower(file, lower_file);
+ return 0;
+}
+
static int ecryptfs_flush(struct file *file, fl_owner_t td)
{
struct file *lower_file = ecryptfs_file_to_lower(file);
@@ -267,6 +310,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
return 0;
}
+static int ecryptfs_dir_release(struct inode *inode, struct file *file)
+{
+ fput(ecryptfs_file_to_lower(file));
+ kmem_cache_free(ecryptfs_file_info_cache,
+ ecryptfs_file_to_private(file));
+ return 0;
+}
+
+static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence);
+}
+
static int
ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
@@ -340,31 +396,27 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
#endif
const struct file_operations ecryptfs_dir_fops = {
- .iterate = ecryptfs_readdir,
+ .iterate_shared = ecryptfs_readdir,
.read = generic_read_dir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
#endif
- .open = ecryptfs_open,
- .flush = ecryptfs_flush,
- .release = ecryptfs_release,
+ .open = ecryptfs_dir_open,
+ .release = ecryptfs_dir_release,
.fsync = ecryptfs_fsync,
- .fasync = ecryptfs_fasync,
- .splice_read = generic_file_splice_read,
- .llseek = default_llseek,
+ .llseek = ecryptfs_dir_llseek,
};
const struct file_operations ecryptfs_main_fops = {
.llseek = generic_file_llseek,
.read_iter = ecryptfs_read_update_atime,
.write_iter = generic_file_write_iter,
- .iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ecryptfs_mmap,
.open = ecryptfs_open,
.flush = ecryptfs_flush,
.release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 121114e9a4643..9d153b6a1d723 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -324,9 +324,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
/**
* ecryptfs_lookup_interpose - Dentry interposition for a lookup
*/
-static int ecryptfs_lookup_interpose(struct dentry *dentry,
- struct dentry *lower_dentry,
- struct inode *dir_inode)
+static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
+ struct dentry *lower_dentry)
{
struct inode *inode, *lower_inode = d_inode(lower_dentry);
struct ecryptfs_dentry_info *dentry_info;
@@ -339,11 +338,12 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
"to allocate ecryptfs_dentry_info struct\n",
__func__);
dput(lower_dentry);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
- fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent));
+ fsstack_copy_attr_atime(d_inode(dentry->d_parent),
+ d_inode(lower_dentry->d_parent));
BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
@@ -353,27 +353,25 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
if (d_really_is_negative(lower_dentry)) {
/* We want to add because we couldn't find in lower */
d_add(dentry, NULL);
- return 0;
+ return NULL;
}
- inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
+ inode = __ecryptfs_get_inode(lower_inode, dentry->d_sb);
if (IS_ERR(inode)) {
printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
__func__, PTR_ERR(inode));
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
}
if (S_ISREG(inode->i_mode)) {
rc = ecryptfs_i_size_read(dentry, inode);
if (rc) {
make_bad_inode(inode);
- return rc;
+ return ERR_PTR(rc);
}
}
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
- d_add(dentry, inode);
-
- return rc;
+ return d_splice_alias(inode, dentry);
}
/**
@@ -390,55 +388,42 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
unsigned int flags)
{
char *encrypted_and_encoded_name = NULL;
- size_t encrypted_and_encoded_name_size;
- struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct dentry *lower_dir_dentry, *lower_dentry;
+ const char *name = ecryptfs_dentry->d_name.name;
+ size_t len = ecryptfs_dentry->d_name.len;
+ struct dentry *res;
int rc = 0;
lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
- lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name,
- lower_dir_dentry,
- ecryptfs_dentry->d_name.len);
- if (IS_ERR(lower_dentry)) {
- rc = PTR_ERR(lower_dentry);
- ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
- "[%d] on lower_dentry = [%pd]\n", __func__, rc,
- ecryptfs_dentry);
- goto out;
- }
- if (d_really_is_positive(lower_dentry))
- goto interpose;
+
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
- if (!(mount_crypt_stat
- && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
- goto interpose;
- dput(lower_dentry);
- rc = ecryptfs_encrypt_and_encode_filename(
- &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
- mount_crypt_stat, ecryptfs_dentry->d_name.name,
- ecryptfs_dentry->d_name.len);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to encrypt and encode "
- "filename; rc = [%d]\n", __func__, rc);
- goto out;
+ if (mount_crypt_stat
+ && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+ rc = ecryptfs_encrypt_and_encode_filename(
+ &encrypted_and_encoded_name, &len,
+ mount_crypt_stat, name, len);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to encrypt and encode "
+ "filename; rc = [%d]\n", __func__, rc);
+ return ERR_PTR(rc);
+ }
+ name = encrypted_and_encoded_name;
}
- lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name,
- lower_dir_dentry,
- encrypted_and_encoded_name_size);
+
+ lower_dentry = lookup_one_len_unlocked(name, lower_dir_dentry, len);
if (IS_ERR(lower_dentry)) {
- rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
- "[%d] on lower_dentry = [%s]\n", __func__, rc,
- encrypted_and_encoded_name);
- goto out;
+ "[%ld] on lower_dentry = [%s]\n", __func__,
+ PTR_ERR(lower_dentry),
+ name);
+ res = ERR_CAST(lower_dentry);
+ } else {
+ res = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry);
}
-interpose:
- rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
- ecryptfs_dir_inode);
-out:
kfree(encrypted_and_encoded_name);
- return ERR_PTR(rc);
+ return res;
}
static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -763,10 +748,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
} else { /* ia->ia_size < i_size_read(inode) */
/* We're chopping off all the pages down to the page
* in which ia->ia_size is located. Fill in the end of
- * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
- * PAGE_CACHE_SIZE with zeros. */
- size_t num_zeros = (PAGE_CACHE_SIZE
- - (ia->ia_size & ~PAGE_CACHE_MASK));
+ * that page from (ia->ia_size & ~PAGE_MASK) to
+ * PAGE_SIZE with zeros. */
+ size_t num_zeros = (PAGE_SIZE
+ - (ia->ia_size & ~PAGE_MASK));
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
truncate_setsize(inode, ia->ia_size);
@@ -898,8 +883,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
struct ecryptfs_crypt_stat *crypt_stat;
crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
- if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
- ecryptfs_init_crypt_stat(crypt_stat);
+ if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) {
+ rc = ecryptfs_init_crypt_stat(crypt_stat);
+ if (rc)
+ return rc;
+ }
inode = d_inode(dentry);
lower_inode = ecryptfs_inode_to_lower(inode);
lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -1013,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
}
int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags)
{
int rc = 0;
@@ -1026,36 +1015,37 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
- if (!rc && d_really_is_positive(dentry))
- fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry));
+ if (!rc && inode)
+ fsstack_copy_attr_all(inode, d_inode(lower_dentry));
out:
return rc;
}
ssize_t
-ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
- void *value, size_t size)
+ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
+ const char *name, void *value, size_t size)
{
int rc = 0;
- if (!d_inode(lower_dentry)->i_op->getxattr) {
+ if (!lower_inode->i_op->getxattr) {
rc = -EOPNOTSUPP;
goto out;
}
- inode_lock(d_inode(lower_dentry));
- rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
- size);
- inode_unlock(d_inode(lower_dentry));
+ inode_lock(lower_inode);
+ rc = lower_inode->i_op->getxattr(lower_dentry, lower_inode,
+ name, value, size);
+ inode_unlock(lower_inode);
out:
return rc;
}
static ssize_t
-ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value,
- size_t size)
+ecryptfs_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
{
- return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), name,
- value, size);
+ return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+ ecryptfs_inode_to_lower(inode),
+ name, value, size);
}
static ssize_t
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 9893d15381222..3cf1546dca825 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1798,7 +1798,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
* added the our &auth_tok_list */
next_packet_is_auth_tok_packet = 1;
while (next_packet_is_auth_tok_packet) {
- size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
+ size_t max_packet_size = ((PAGE_SIZE - 8) - i);
switch (src[i]) {
case ECRYPTFS_TAG_3_PACKET_TYPE:
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 8b0b4a73116d0..6120044951415 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -695,12 +695,12 @@ static struct ecryptfs_cache_info {
{
.cache = &ecryptfs_header_cache,
.name = "ecryptfs_headers",
- .size = PAGE_CACHE_SIZE,
+ .size = PAGE_SIZE,
},
{
.cache = &ecryptfs_xattr_cache,
.name = "ecryptfs_xattr_cache",
- .size = PAGE_CACHE_SIZE,
+ .size = PAGE_SIZE,
},
{
.cache = &ecryptfs_key_record_cache,
@@ -738,8 +738,7 @@ static void ecryptfs_free_kmem_caches(void)
struct ecryptfs_cache_info *info;
info = &ecryptfs_cache_infos[i];
- if (*(info->cache))
- kmem_cache_destroy(*(info->cache));
+ kmem_cache_destroy(*(info->cache));
}
}
@@ -818,7 +817,7 @@ static int __init ecryptfs_init(void)
{
int rc;
- if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
+ if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) {
rc = -EINVAL;
ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
"larger than the host's page size, and so "
@@ -826,7 +825,7 @@ static int __init ecryptfs_init(void)
"default eCryptfs extent size is [%u] bytes; "
"the page size is [%lu] bytes.\n",
ECRYPTFS_DEFAULT_EXTENT_SIZE,
- (unsigned long)PAGE_CACHE_SIZE);
+ (unsigned long)PAGE_SIZE);
goto out;
}
rc = ecryptfs_init_kmem_caches();
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 1f5865263b3ef..9c3437c8a5b12 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -122,7 +122,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
struct ecryptfs_crypt_stat *crypt_stat)
{
loff_t extent_num_in_page = 0;
- loff_t num_extents_per_page = (PAGE_CACHE_SIZE
+ loff_t num_extents_per_page = (PAGE_SIZE
/ crypt_stat->extent_size);
int rc = 0;
@@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
char *page_virt;
page_virt = kmap_atomic(page);
- memset(page_virt, 0, PAGE_CACHE_SIZE);
+ memset(page_virt, 0, PAGE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
size_t written;
@@ -164,8 +164,8 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
- crypt_stat->metadata_size);
rc = ecryptfs_read_lower_page_segment(
- page, (lower_offset >> PAGE_CACHE_SHIFT),
- (lower_offset & ~PAGE_CACHE_MASK),
+ page, (lower_offset >> PAGE_SHIFT),
+ (lower_offset & ~PAGE_MASK),
crypt_stat->extent_size, page->mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
@@ -198,7 +198,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
- PAGE_CACHE_SIZE,
+ PAGE_SIZE,
page->mapping->host);
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
@@ -215,7 +215,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
} else {
rc = ecryptfs_read_lower_page_segment(
- page, page->index, 0, PAGE_CACHE_SIZE,
+ page, page->index, 0, PAGE_SIZE,
page->mapping->host);
if (rc) {
printk(KERN_ERR "Error reading page; rc = "
@@ -250,12 +250,12 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
struct inode *inode = page->mapping->host;
int end_byte_in_page;
- if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index)
+ if ((i_size_read(inode) / PAGE_SIZE) != page->index)
goto out;
- end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
+ end_byte_in_page = i_size_read(inode) % PAGE_SIZE;
if (to > end_byte_in_page)
end_byte_in_page = to;
- zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE);
+ zero_user_segment(page, end_byte_in_page, PAGE_SIZE);
out:
return 0;
}
@@ -279,7 +279,7 @@ static int ecryptfs_write_begin(struct file *file,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
loff_t prev_page_end_size;
int rc = 0;
@@ -289,14 +289,14 @@ static int ecryptfs_write_begin(struct file *file,
return -ENOMEM;
*pagep = page;
- prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
+ prev_page_end_size = ((loff_t)index << PAGE_SHIFT);
if (!PageUptodate(page)) {
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(
- page, index, 0, PAGE_CACHE_SIZE, mapping->host);
+ page, index, 0, PAGE_SIZE, mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"lower page segment; rc = [%d]\n",
@@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file,
SetPageUptodate(page);
} else {
rc = ecryptfs_read_lower_page_segment(
- page, index, 0, PAGE_CACHE_SIZE,
+ page, index, 0, PAGE_SIZE,
mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error reading "
@@ -336,9 +336,9 @@ static int ecryptfs_write_begin(struct file *file,
} else {
if (prev_page_end_size
>= i_size_read(page->mapping->host)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
SetPageUptodate(page);
- } else if (len < PAGE_CACHE_SIZE) {
+ } else if (len < PAGE_SIZE) {
rc = ecryptfs_decrypt_page(page);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
@@ -371,11 +371,11 @@ static int ecryptfs_write_begin(struct file *file,
* of page? Zero it out. */
if ((i_size_read(mapping->host) == prev_page_end_size)
&& (pos != 0))
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
out:
if (unlikely(rc)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
*pagep = NULL;
}
return rc;
@@ -436,12 +436,14 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
goto out;
}
inode_lock(lower_inode);
- size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
- xattr_virt, PAGE_CACHE_SIZE);
+ size = lower_inode->i_op->getxattr(lower_dentry, lower_inode,
+ ECRYPTFS_XATTR_NAME,
+ xattr_virt, PAGE_SIZE);
if (size < 0)
size = 8;
put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
- rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
+ rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode,
+ ECRYPTFS_XATTR_NAME,
xattr_virt, size, 0);
inode_unlock(lower_inode);
if (rc)
@@ -479,8 +481,8 @@ static int ecryptfs_write_end(struct file *file,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + copied;
struct inode *ecryptfs_inode = mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
@@ -500,7 +502,7 @@ static int ecryptfs_write_end(struct file *file,
goto out;
}
if (!PageUptodate(page)) {
- if (copied < PAGE_CACHE_SIZE) {
+ if (copied < PAGE_SIZE) {
rc = 0;
goto out;
}
@@ -533,7 +535,7 @@ static int ecryptfs_write_end(struct file *file,
rc = copied;
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return rc;
}
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 09fe622274e44..158a3a39f82de 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -74,7 +74,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
loff_t offset;
int rc;
- offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
+ offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
+ offset_in_page);
virt = kmap(page_for_lower);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
@@ -123,9 +123,9 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
else
pos = offset;
while (pos < (offset + size)) {
- pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
- size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
- size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
+ pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT);
+ size_t start_offset_in_page = (pos & ~PAGE_MASK);
+ size_t num_bytes = (PAGE_SIZE - start_offset_in_page);
loff_t total_remaining_bytes = ((offset + size) - pos);
if (fatal_signal_pending(current)) {
@@ -165,7 +165,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
* Fill in zero values to the end of the page */
memset(((char *)ecryptfs_page_virt
+ start_offset_in_page), 0,
- PAGE_CACHE_SIZE - start_offset_in_page);
+ PAGE_SIZE - start_offset_in_page);
}
/* pos >= offset, we are now writing the data request */
@@ -186,7 +186,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
ecryptfs_page,
start_offset_in_page,
data_offset);
- page_cache_release(ecryptfs_page);
+ put_page(ecryptfs_page);
if (rc) {
printk(KERN_ERR "%s: Error encrypting "
"page; rc = [%d]\n", __func__, rc);
@@ -262,7 +262,7 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
loff_t offset;
int rc;
- offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
+ offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
virt = kmap(page_for_ecryptfs);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 77a486d3a51b6..85411ceb0508b 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -55,7 +55,10 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
- ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
+ if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {
+ kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+ goto out;
+ }
mutex_init(&inode_info->lower_file_mutex);
atomic_set(&inode_info->lower_file_count, 0);
inode_info->lower_file = NULL;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index d48e0d261d78d..5f22e74bbadea 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -157,7 +157,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
return 0;
}
-long
+static long
efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
{
void __user *arg = (void __user *)p;
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index e2ab6d0497f2b..1d73fc6dba13b 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/uuid.h>
#include "internal.h"
@@ -46,11 +47,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
*/
bool efivarfs_valid_name(const char *str, int len)
{
- static const char dashes[EFI_VARIABLE_GUID_LEN] = {
- [8] = 1, [13] = 1, [18] = 1, [23] = 1
- };
const char *s = str + len - EFI_VARIABLE_GUID_LEN;
- int i;
/*
* We need a GUID, plus at least one letter for the variable name,
@@ -68,37 +65,7 @@ bool efivarfs_valid_name(const char *str, int len)
*
* 12345678-1234-1234-1234-123456789abc
*/
- for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) {
- if (dashes[i]) {
- if (*s++ != '-')
- return false;
- } else {
- if (!isxdigit(*s++))
- return false;
- }
- }
-
- return true;
-}
-
-static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
-{
- guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]);
- guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]);
- guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]);
- guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]);
- guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]);
- guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]);
- guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]);
- guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]);
- guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]);
- guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]);
- guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]);
- guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]);
- guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]);
- guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]);
- guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]);
- guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]);
+ return uuid_is_valid(s);
}
static int efivarfs_create(struct inode *dir, struct dentry *dentry,
@@ -119,8 +86,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
/* length of the variable name itself: remove GUID and separator */
namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
- efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
- &var->var.VendorGuid);
+ uuid_le_to_bin(dentry->d_name.name + namelen + 1, &var->var.VendorGuid);
if (efivar_variable_is_removable(var->var.VendorGuid,
dentry->d_name.name, namelen))
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index dd029d13ea614..688ccc16b702d 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -45,8 +45,7 @@ static struct super_block *efivarfs_sb;
* So we need to perform a case-sensitive match on part 1 and a
* case-insensitive match on part 2.
*/
-static int efivarfs_d_compare(const struct dentry *parent,
- const struct dentry *dentry,
+static int efivarfs_d_compare(const struct dentry *dentry,
unsigned int len, const char *str,
const struct qstr *name)
{
@@ -65,7 +64,7 @@ static int efivarfs_d_compare(const struct dentry *parent,
static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(dentry);
const unsigned char *s = qstr->name;
unsigned int len = qstr->len;
@@ -98,7 +97,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
q.name = name;
q.len = strlen(name);
- err = efivarfs_d_hash(NULL, &q);
+ err = efivarfs_d_hash(parent, &q);
if (err)
return ERR_PTR(err);
@@ -197,8 +196,8 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
efivarfs_sb = sb;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = EFIVARFS_MAGIC;
sb->s_op = &efivarfs_ops;
sb->s_d_op = &efivarfs_d_ops;
@@ -216,8 +215,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&efivarfs_list);
- err = efivar_init(efivarfs_callback, (void *)sb, false,
- true, &efivarfs_list);
+ err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list);
if (err)
__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index ce63b24f7c3e2..a7be96e5f1cb8 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -12,7 +12,7 @@ static int efs_readdir(struct file *, struct dir_context *);
const struct file_operations efs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = efs_readdir,
+ .iterate_shared = efs_readdir,
};
const struct inode_operations efs_dir_inode_operations = {
@@ -100,4 +100,3 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
return 0;
}
-
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 40ba9cc41bf74..d34a40edcdb27 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -113,7 +113,7 @@ struct dentry *efs_get_parent(struct dentry *child)
ino = efs_find_entry(d_inode(child), "..", 2);
if (ino)
- parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino));
+ parent = d_obtain_alias(efs_iget(child->d_sb, ino));
return parent;
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index cb68dac4f9d32..368f7dd21c610 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -275,7 +275,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
if (!bh) {
pr_err("cannot read volume header\n");
- return -EINVAL;
+ return -EIO;
}
/*
@@ -293,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
bh = sb_bread(s, sb->fs_start + EFS_SUPER);
if (!bh) {
pr_err("cannot read superblock\n");
- return -EINVAL;
+ return -EIO;
}
if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a74a2a52e0fa..10db912189338 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep,
return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
-static inline struct timespec ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_mstimeout(long ms)
{
- struct timespec now, ts = {
+ struct timespec64 now, ts = {
.tv_sec = ms / MSEC_PER_SEC,
.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
};
- ktime_get_ts(&now);
- return timespec_add_safe(now, ts);
+ ktime_get_ts64(&now);
+ return timespec64_add_safe(now, ts);
}
/**
@@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
ktime_t expires, *to = NULL;
if (timeout > 0) {
- struct timespec end_time = ep_set_mstimeout(timeout);
+ struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
- *to = timespec_to_ktime(end_time);
+ *to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
diff --git a/fs/exec.c b/fs/exec.c
index c4010b8207a14..6fcfb3f7b1379 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -243,10 +243,6 @@ static void put_arg_page(struct page *page)
put_page(page);
}
-static void free_arg_page(struct linux_binprm *bprm, int i)
-{
-}
-
static void free_arg_pages(struct linux_binprm *bprm)
{
}
@@ -267,7 +263,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
if (!vma)
return -ENOMEM;
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem)) {
+ err = -EINTR;
+ goto err_free;
+ }
vma->vm_mm = mm;
/*
@@ -294,6 +293,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
return 0;
err:
up_write(&mm->mmap_sem);
+err_free:
bprm->vma = NULL;
kmem_cache_free(vm_area_cachep, vma);
return err;
@@ -700,7 +700,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
bprm->loader -= stack_shift;
bprm->exec -= stack_shift;
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
vm_flags = VM_STACK_FLAGS;
/*
@@ -760,6 +762,39 @@ out_unlock:
}
EXPORT_SYMBOL(setup_arg_pages);
+#else
+
+/*
+ * Transfer the program arguments and environment from the holding pages
+ * onto the stack. The provided stack pointer is adjusted accordingly.
+ */
+int transfer_args_to_stack(struct linux_binprm *bprm,
+ unsigned long *sp_location)
+{
+ unsigned long index, stop, sp;
+ int ret = 0;
+
+ stop = bprm->p >> PAGE_SHIFT;
+ sp = *sp_location;
+
+ for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
+ unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
+ char *src = kmap(bprm->page[index]) + offset;
+ sp -= PAGE_SIZE - offset;
+ if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
+ ret = -EFAULT;
+ kunmap(bprm->page[index]);
+ if (ret)
+ goto out;
+ }
+
+ *sp_location = sp;
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(transfer_args_to_stack);
+
#endif /* CONFIG_MMU */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
@@ -850,15 +885,26 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
if (ret)
return ret;
+ ret = deny_write_access(file);
+ if (ret)
+ return ret;
+
i_size = i_size_read(file_inode(file));
- if (max_size > 0 && i_size > max_size)
- return -EFBIG;
- if (i_size <= 0)
- return -EINVAL;
+ if (max_size > 0 && i_size > max_size) {
+ ret = -EFBIG;
+ goto out;
+ }
+ if (i_size <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
- *buf = vmalloc(i_size);
- if (!*buf)
- return -ENOMEM;
+ if (id != READING_FIRMWARE_PREALLOC_BUFFER)
+ *buf = vmalloc(i_size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
pos = 0;
while (pos < i_size) {
@@ -876,18 +922,23 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
if (pos != i_size) {
ret = -EIO;
- goto out;
+ goto out_free;
}
ret = security_kernel_post_read_file(file, *buf, i_size, id);
if (!ret)
*size = pos;
-out:
+out_free:
if (ret < 0) {
- vfree(*buf);
- *buf = NULL;
+ if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
+ vfree(*buf);
+ *buf = NULL;
+ }
}
+
+out:
+ allow_write_access(file);
return ret;
}
EXPORT_SYMBOL_GPL(kernel_read_file);
@@ -1387,11 +1438,16 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
kuid_t uid;
kgid_t gid;
- /* clear any previous set[ug]id data from a previous binary */
+ /*
+ * Since this can be called multiple times (via prepare_binprm),
+ * we must clear any previous work done when setting set[ug]id
+ * bits from any earlier bprm->file uses (for example when run
+ * first for a setuid script then again for its interpreter).
+ */
bprm->cred->euid = current_euid();
bprm->cred->egid = current_egid();
- if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+ if (!mnt_may_suid(bprm->file->f_path.mnt))
return;
if (task_no_new_privs(current))
@@ -1481,9 +1537,6 @@ int remove_arg_zero(struct linux_binprm *bprm)
kunmap_atomic(kaddr);
put_arg_page(page);
-
- if (offset == PAGE_SIZE)
- free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
} while (offset == PAGE_SIZE);
bprm->p++;
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index e5bb2abf77f9a..f69a1b5826a5c 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -41,16 +41,16 @@ static inline unsigned exofs_chunk_size(struct inode *inode)
static inline void exofs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
{
loff_t last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
@@ -79,19 +79,19 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
return err;
}
-static void exofs_check_page(struct page *page)
+static bool exofs_check_page(struct page *page)
{
struct inode *dir = page->mapping->host;
unsigned chunk_size = exofs_chunk_size(dir);
char *kaddr = page_address(page);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
struct exofs_dir_entry *p;
char *error;
/* if the page is the last one in the directory */
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -114,7 +114,7 @@ static void exofs_check_page(struct page *page)
goto Eend;
out:
SetPageChecked(page);
- return;
+ return true;
Ebadsize:
EXOFS_ERR("ERROR [exofs_check_page]: "
@@ -138,7 +138,7 @@ bad_entry:
EXOFS_ERR(
"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
"offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)),
rec_len, p->name_len);
goto fail;
@@ -147,11 +147,11 @@ Eend:
EXOFS_ERR("ERROR [exofs_check_page]: "
"entry in directory(0x%lx) spans the page boundary"
"offset=%lu, inode=0x%llx\n",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)));
fail:
- SetPageChecked(page);
SetPageError(page);
+ return false;
}
static struct page *exofs_get_page(struct inode *dir, unsigned long n)
@@ -161,10 +161,10 @@ static struct page *exofs_get_page(struct inode *dir, unsigned long n)
if (!IS_ERR(page)) {
kmap(page);
- if (!PageChecked(page))
- exofs_check_page(page);
- if (PageError(page))
- goto fail;
+ if (unlikely(!PageChecked(page))) {
+ if (PageError(page) || !exofs_check_page(page))
+ goto fail;
+ }
}
return page;
@@ -237,8 +237,8 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
{
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
int need_revalidate = (file->f_version != inode->i_version);
@@ -254,7 +254,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
if (IS_ERR(page)) {
EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
@@ -262,7 +262,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
if (offset) {
offset = exofs_validate_entry(kaddr, offset,
chunk_mask);
- ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode->i_version;
need_revalidate = 0;
@@ -449,7 +449,7 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + exofs_last_byte(dir, n);
de = (struct exofs_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
name_len = 0;
@@ -602,7 +602,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
kunmap_atomic(kaddr);
err = exofs_commit_chunk(page, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -657,5 +657,5 @@ not_empty:
const struct file_operations exofs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = exofs_readdir,
+ .iterate_shared = exofs_readdir,
};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 9eaf595aeaf88..9dc4c6dbf3c99 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -317,7 +317,7 @@ static int read_exec(struct page_collect *pcol)
if (!pcol->ios) {
int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
- pcol->pg_first << PAGE_CACHE_SHIFT,
+ pcol->pg_first << PAGE_SHIFT,
pcol->length, &pcol->ios);
if (ret)
@@ -383,7 +383,7 @@ static int readpage_strip(void *data, struct page *page)
struct inode *inode = pcol->inode;
struct exofs_i_info *oi = exofs_i(inode);
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
size_t len;
int ret;
@@ -397,9 +397,9 @@ static int readpage_strip(void *data, struct page *page)
pcol->that_locked_page = page;
if (page->index < end_index)
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
else if (page->index == end_index)
- len = i_size & ~PAGE_CACHE_MASK;
+ len = i_size & ~PAGE_MASK;
else
len = 0;
@@ -442,8 +442,8 @@ try_again:
goto fail;
}
- if (len != PAGE_CACHE_SIZE)
- zero_user(page, len, PAGE_CACHE_SIZE - len);
+ if (len != PAGE_SIZE)
+ zero_user(page, len, PAGE_SIZE - len);
EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
inode->i_ino, page->index, len);
@@ -609,7 +609,7 @@ static void __r4w_put_page(void *priv, struct page *page)
if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
EXOFS_DBGMSG2("index=0x%lx\n", page->index);
- page_cache_release(page);
+ put_page(page);
return;
}
EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
@@ -633,7 +633,7 @@ static int write_exec(struct page_collect *pcol)
BUG_ON(pcol->ios);
ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
- pcol->pg_first << PAGE_CACHE_SHIFT,
+ pcol->pg_first << PAGE_SHIFT,
pcol->length, &pcol->ios);
if (unlikely(ret))
goto err;
@@ -696,7 +696,7 @@ static int writepage_strip(struct page *page,
struct inode *inode = pcol->inode;
struct exofs_i_info *oi = exofs_i(inode);
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
size_t len;
int ret;
@@ -708,9 +708,9 @@ static int writepage_strip(struct page *page,
if (page->index < end_index)
/* in this case, the page is within the limits of the file */
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
else {
- len = i_size & ~PAGE_CACHE_MASK;
+ len = i_size & ~PAGE_MASK;
if (page->index > end_index || !len) {
/* in this case, the page is outside the limits
@@ -790,10 +790,10 @@ static int exofs_writepages(struct address_space *mapping,
long start, end, expected_pages;
int ret;
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
+ start = wbc->range_start >> PAGE_SHIFT;
end = (wbc->range_end == LLONG_MAX) ?
start + mapping->nrpages :
- wbc->range_end >> PAGE_CACHE_SHIFT;
+ wbc->range_end >> PAGE_SHIFT;
if (start || end)
expected_pages = end - start + 1;
@@ -881,15 +881,15 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
}
/* read modify write */
- if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+ if (!PageUptodate(page) && (len != PAGE_SIZE)) {
loff_t i_size = i_size_read(mapping->host);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
size_t rlen;
if (page->index < end_index)
- rlen = PAGE_CACHE_SIZE;
+ rlen = PAGE_SIZE;
else if (page->index == end_index)
- rlen = i_size & ~PAGE_CACHE_MASK;
+ rlen = i_size & ~PAGE_MASK;
else
rlen = 0;
@@ -960,8 +960,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
/* TODO: Should be easy enough to do proprly */
-static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
return 0;
}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index c20d77df2679a..622a686bb08b5 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -292,11 +292,11 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 7bd8ac8dfb280..8bb72807e70d4 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -878,7 +878,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
} else {
bio = master_dev->bio;
/* FIXME: bio_set_dir() */
- bio->bi_rw |= REQ_WRITE;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
}
osd_req_write(or, _ios_obj(ios, cur_comp),
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6658a50530a06..1076a4233b396 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -122,7 +122,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
if (match_int(&args[0], &option))
return -EINVAL;
if (option <= 0) {
- EXOFS_ERR("Timout must be > 0");
+ EXOFS_ERR("Timeout must be > 0");
return -EINVAL;
}
opts->timeout = option * HZ;
@@ -958,7 +958,7 @@ static struct dentry *exofs_get_parent(struct dentry *child)
if (!ino)
return ERR_PTR(-ESTALE);
- return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino));
+ return d_obtain_alias(exofs_iget(child->d_sb, ino));
}
static struct inode *exofs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c46f1a190b8d9..207ba8d627ca7 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -143,14 +143,18 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- inode_lock(parent->d_inode);
- tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
- inode_unlock(parent->d_inode);
+ tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
goto out_err;
}
if (tmp != dentry) {
+ /*
+ * Somebody has renamed it since exportfs_get_name();
+ * great, since it could've only been renamed if it
+ * got looked up and thus connected, and it would
+ * remain connected afterwards. We are done.
+ */
dput(tmp);
goto out_reconnected;
}
@@ -308,7 +312,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
goto out;
error = -EINVAL;
- if (!file->f_op->iterate)
+ if (!file->f_op->iterate && !file->f_op->iterate_shared)
goto out_close;
buffer.sequence = 0;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 27695e6f4e466..42f1d1814083c 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -172,9 +172,6 @@ ext2_get_acl(struct inode *inode, int type)
acl = ERR_PTR(retval);
kfree(value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
return acl;
}
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 9f9992b37924a..4c40c0786e168 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1194,6 +1194,27 @@ static int ext2_has_free_blocks(struct ext2_sb_info *sbi)
}
/*
+ * Returns 1 if the passed-in block region is valid; 0 if some part overlaps
+ * with filesystem metadata blocks.
+ */
+int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk,
+ unsigned int count)
+{
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk > le32_to_cpu(sbi->s_es->s_blocks_count)))
+ return 0;
+
+ /* Ensure we do not step over superblock */
+ if ((start_blk <= sbi->s_sb_block) &&
+ (start_blk + count >= sbi->s_sb_block))
+ return 0;
+
+
+ return 1;
+}
+
+/*
* ext2_new_blocks() -- core block(s) allocation function
* @inode: file inode
* @goal: given target block(filesystem wide)
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 0c6638b40f217..61ad490ed67b9 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -37,7 +37,7 @@ static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
{
unsigned len = le16_to_cpu(dlen);
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len == EXT2_MAX_REC_LEN)
return 1 << 16;
#endif
@@ -46,7 +46,7 @@ static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
static inline __le16 ext2_rec_len_to_disk(unsigned len)
{
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len == (1 << 16))
return cpu_to_le16(EXT2_MAX_REC_LEN);
else
@@ -67,7 +67,7 @@ static inline unsigned ext2_chunk_size(struct inode *inode)
static inline void ext2_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -79,9 +79,9 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
{
unsigned last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
@@ -110,7 +110,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
return err;
}
-static void ext2_check_page(struct page *page, int quiet)
+static bool ext2_check_page(struct page *page, int quiet)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
@@ -118,12 +118,12 @@ static void ext2_check_page(struct page *page, int quiet)
char *kaddr = page_address(page);
u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
ext2_dirent *p;
char *error;
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -148,7 +148,7 @@ static void ext2_check_page(struct page *page, int quiet)
goto Eend;
out:
SetPageChecked(page);
- return;
+ return true;
/* Too bad, we had an error */
@@ -176,7 +176,7 @@ bad_entry:
if (!quiet)
ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
(unsigned long) le32_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
@@ -186,12 +186,12 @@ Eend:
ext2_error(sb, "ext2_check_page",
"entry in directory #%lu spans the page boundary"
"offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
(unsigned long) le32_to_cpu(p->inode));
}
fail:
- SetPageChecked(page);
SetPageError(page);
+ return false;
}
static struct page * ext2_get_page(struct inode *dir, unsigned long n,
@@ -201,10 +201,10 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n,
struct page *page = read_mapping_page(mapping, n, NULL);
if (!IS_ERR(page)) {
kmap(page);
- if (!PageChecked(page))
- ext2_check_page(page, quiet);
- if (PageError(page))
- goto fail;
+ if (unlikely(!PageChecked(page))) {
+ if (PageError(page) || !ext2_check_page(page, quiet))
+ goto fail;
+ }
}
return page;
@@ -287,8 +287,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
unsigned char *types = NULL;
@@ -309,14 +309,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
ext2_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
- ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode->i_version;
need_revalidate = 0;
@@ -358,8 +358,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
* and the entry itself. Page is returned mapped and unlocked.
* Entry is guaranteed to be valid.
*/
-struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
- struct qstr *child, struct page ** res_page)
+struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir,
+ const struct qstr *child, struct page **res_page)
{
const char *name = child->name;
int namelen = child->len;
@@ -406,7 +406,7 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
if (++n >= npages)
n = 0;
/* next page is past the blocks we've got */
- if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
ext2_error(dir->i_sb, __func__,
"dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
@@ -435,7 +435,7 @@ struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
return de;
}
-ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
+ino_t ext2_inode_by_name(struct inode *dir, const struct qstr *child)
{
ino_t res = 0;
struct ext2_dir_entry_2 *de;
@@ -511,7 +511,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + ext2_last_byte(dir, n);
de = (ext2_dirent *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -655,7 +655,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
kunmap_atomic(kaddr);
err = ext2_commit_chunk(page, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -716,7 +716,7 @@ not_empty:
const struct file_operations ext2_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = ext2_readdir,
+ .iterate_shared = ext2_readdir,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 170939f379d74..06af2f92226c9 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -367,6 +367,7 @@ struct ext2_inode {
*/
#define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */
#define EXT2_ERROR_FS 0x0002 /* Errors detected */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
/*
* Mount flags
@@ -739,6 +740,8 @@ extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group);
extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *);
extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long,
unsigned long *, int *);
+extern int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk,
+ unsigned int count);
extern void ext2_free_blocks (struct inode *, unsigned long,
unsigned long);
extern unsigned long ext2_count_free_blocks (struct super_block *);
@@ -754,9 +757,9 @@ extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_wind
/* dir.c */
extern int ext2_add_link (struct dentry *, struct inode *);
-extern ino_t ext2_inode_by_name(struct inode *, struct qstr *);
+extern ino_t ext2_inode_by_name(struct inode *, const struct qstr *);
extern int ext2_make_empty(struct inode *, struct inode *);
-extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **);
+extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,const struct qstr *, struct page **);
extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
extern int ext2_empty_dir (struct inode *);
extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c1400b109805b..5efeefe17abb4 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+ ret = dax_fault(vma, vmf, ext2_get_block);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
}
down_read(&ei->dax_sem);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+ ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
up_read(&ei->dax_sem);
if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6bd58e6ff0386..d5c7d09919f31 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,6 +26,7 @@
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
+#include <linux/blkdev.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
@@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode,
* so that it's not found by another thread before it's
* initialised
*/
- err = dax_clear_sectors(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) <<
- (inode->i_blkbits - 9),
- 1 << inode->i_blkbits);
+ err = sb_issue_zeroout(inode->i_sb,
+ le32_to_cpu(chain[depth-1].key), count,
+ GFP_NOFS);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
}
- }
+ } else
+ set_buffer_new(bh_result);
ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex);
- set_buffer_new(bh_result);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
@@ -854,20 +854,20 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
}
static ssize_t
-ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
+ loff_t offset = iocb->ki_pos;
ssize_t ret;
if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
+ ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
DIO_LOCKING);
else
- ret = blockdev_direct_IO(iocb, inode, iter, offset,
- ext2_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
if (ret < 0 && iov_iter_rw(iter) == WRITE)
ext2_write_failed(mapping, offset + count);
return ret;
@@ -1389,6 +1389,16 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
ei->i_frag_size = raw_inode->i_fsize;
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
ei->i_dir_acl = 0;
+
+ if (ei->i_file_acl &&
+ !ext2_data_block_valid(EXT2_SB(sb), ei->i_file_acl, 1)) {
+ ext2_error(sb, "ext2_iget", "bad extended attribute block %u",
+ ei->i_file_acl);
+ brelse(bh);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+
if (S_ISREG(inode->i_mode))
inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
else
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 7a2be8f7f3c37..d446203127fcf 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -82,7 +82,7 @@ struct dentry *ext2_get_parent(struct dentry *child)
unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot);
if (!ino)
return ERR_PTR(-ENOENT);
- return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino));
+ return d_obtain_alias(ext2_iget(child->d_sb, ino));
}
/*
@@ -398,7 +398,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
else {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
inode_dec_link_count(old_dir);
}
@@ -408,11 +408,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b78caf25f7462..1d9379568aa83 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
- if (blocksize != PAGE_SIZE) {
- ext2_msg(sb, KERN_ERR,
- "error: unsupported blocksize for dax");
+ err = bdev_dax_supported(sb, blocksize);
+ if (err)
goto failed_mount;
- }
- if (!sb->s_bdev->bd_disk->fops->direct_access) {
- ext2_msg(sb, KERN_ERR,
- "error: device does not support dax");
- goto failed_mount;
- }
}
/* If the blocksize doesn't match, re-read the thing.. */
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 1a5e3bff0b63c..b7f896f3f7a7f 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -759,10 +759,19 @@ void
ext2_xattr_delete_inode(struct inode *inode)
{
struct buffer_head *bh = NULL;
+ struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb);
down_write(&EXT2_I(inode)->xattr_sem);
if (!EXT2_I(inode)->i_file_acl)
goto cleanup;
+
+ if (!ext2_data_block_valid(sbi, EXT2_I(inode)->i_file_acl, 0)) {
+ ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
+ "inode %ld: xattr block %d is out of data blocks range",
+ inode->i_ino, EXT2_I(inode)->i_file_acl);
+ goto cleanup;
+ }
+
bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index ba97f243b0504..7b9e9c1842d52 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -9,19 +9,20 @@
static int
ext2_xattr_security_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name,
buffer, size);
}
static int
ext2_xattr_security_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2c94d19306262..65049b71af137 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -16,19 +16,20 @@ ext2_xattr_trusted_list(struct dentry *dentry)
static int
ext2_xattr_trusted_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name,
buffer, size);
}
static int
ext2_xattr_trusted_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 72a2a96d677f9..fb2f992ae763a 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -18,24 +18,25 @@ ext2_xattr_user_list(struct dentry *dentry)
static int
ext2_xattr_user_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- if (!test_opt(dentry->d_sb, XATTR_USER))
+ if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER,
name, buffer, size);
}
static int
ext2_xattr_user_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- if (!test_opt(dentry->d_sb, XATTR_USER))
+ if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index b46e9fc641960..e38039fd96ff5 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -99,17 +99,9 @@ config EXT4_FS_SECURITY
extended attributes for file security labels, say N.
config EXT4_ENCRYPTION
- tristate "Ext4 Encryption"
+ bool "Ext4 Encryption"
depends on EXT4_FS
- select CRYPTO_AES
- select CRYPTO_CBC
- select CRYPTO_ECB
- select CRYPTO_XTS
- select CRYPTO_CTS
- select CRYPTO_CTR
- select CRYPTO_SHA256
- select KEYS
- select ENCRYPTED_KEYS
+ select FS_ENCRYPTION
help
Enable encryption of ext4 files and directories. This
feature is similar to ecryptfs, but it is more memory
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index f52cf54f0cbc4..354103f3490c3 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -12,5 +12,3 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
-ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \
- crypto_key.o crypto_fname.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 69b1e73026a51..c6601a476c021 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -172,9 +172,6 @@ ext4_get_acl(struct inode *inode, int type)
acl = ERR_PTR(retval);
kfree(value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
return acl;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fe1f50fe764ff..e04ec868e37e7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -208,6 +208,9 @@ static int ext4_init_block_bitmap(struct super_block *sb,
memset(bh->b_data, 0, sb->s_blocksize);
bit_max = ext4_num_base_meta_clusters(sb, block_group);
+ if ((bit_max >> 3) >= bh->b_size)
+ return -EFSCORRUPTED;
+
for (bit = 0; bit < bit_max; bit++)
ext4_set_bit(bit, bh->b_data);
@@ -470,7 +473,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
trace_ext4_read_block_bitmap_load(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(READ | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -610,7 +613,10 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
- return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ smp_mb();
+ if (EXT4_SB(sb)->s_mb_free_pending)
+ jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ return 1;
}
/*
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
deleted file mode 100644
index edc053a819144..0000000000000
--- a/fs/ext4/crypto.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- * linux/fs/ext4/crypto.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption functions for ext4
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- * Uday Savagaonkar, 2014
- * Encryption policy handling additions
- * Ildar Muslukhov, 2014
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
- */
-
-#include <crypto/skcipher.h>
-#include <keys/user-type.h>
-#include <keys/encrypted-type.h>
-#include <linux/ecryptfs.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-
-#include "ext4_extents.h"
-#include "xattr.h"
-
-/* Encryption added and removed here! (L: */
-
-static unsigned int num_prealloc_crypto_pages = 32;
-static unsigned int num_prealloc_crypto_ctxs = 128;
-
-module_param(num_prealloc_crypto_pages, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_pages,
- "Number of crypto pages to preallocate");
-module_param(num_prealloc_crypto_ctxs, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
- "Number of crypto contexts to preallocate");
-
-static mempool_t *ext4_bounce_page_pool;
-
-static LIST_HEAD(ext4_free_crypto_ctxs);
-static DEFINE_SPINLOCK(ext4_crypto_ctx_lock);
-
-static struct kmem_cache *ext4_crypto_ctx_cachep;
-struct kmem_cache *ext4_crypt_info_cachep;
-
-/**
- * ext4_release_crypto_ctx() - Releases an encryption context
- * @ctx: The encryption context to release.
- *
- * If the encryption context was allocated from the pre-allocated pool, returns
- * it to that pool. Else, frees it.
- *
- * If there's a bounce page in the context, this frees that.
- */
-void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
-{
- unsigned long flags;
-
- if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page)
- mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool);
- ctx->w.bounce_page = NULL;
- ctx->w.control_page = NULL;
- if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) {
- kmem_cache_free(ext4_crypto_ctx_cachep, ctx);
- } else {
- spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
- list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
- spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
- }
-}
-
-/**
- * ext4_get_crypto_ctx() - Gets an encryption context
- * @inode: The inode for which we are doing the crypto
- *
- * Allocates and initializes an encryption context.
- *
- * Return: An allocated and initialized encryption context on success; error
- * value or NULL otherwise.
- */
-struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
-{
- struct ext4_crypto_ctx *ctx = NULL;
- int res = 0;
- unsigned long flags;
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
-
- if (ci == NULL)
- return ERR_PTR(-ENOKEY);
-
- /*
- * We first try getting the ctx from a free list because in
- * the common case the ctx will have an allocated and
- * initialized crypto tfm, so it's probably a worthwhile
- * optimization. For the bounce page, we first try getting it
- * from the kernel allocator because that's just about as fast
- * as getting it from a list and because a cache of free pages
- * should generally be a "last resort" option for a filesystem
- * to be able to do its job.
- */
- spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
- ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs,
- struct ext4_crypto_ctx, free_list);
- if (ctx)
- list_del(&ctx->free_list);
- spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
- if (!ctx) {
- ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
- if (!ctx) {
- res = -ENOMEM;
- goto out;
- }
- ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
- } else {
- ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
- }
- ctx->flags &= ~EXT4_WRITE_PATH_FL;
-
-out:
- if (res) {
- if (!IS_ERR_OR_NULL(ctx))
- ext4_release_crypto_ctx(ctx);
- ctx = ERR_PTR(res);
- }
- return ctx;
-}
-
-struct workqueue_struct *ext4_read_workqueue;
-static DEFINE_MUTEX(crypto_init);
-
-/**
- * ext4_exit_crypto() - Shutdown the ext4 encryption system
- */
-void ext4_exit_crypto(void)
-{
- struct ext4_crypto_ctx *pos, *n;
-
- list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list)
- kmem_cache_free(ext4_crypto_ctx_cachep, pos);
- INIT_LIST_HEAD(&ext4_free_crypto_ctxs);
- if (ext4_bounce_page_pool)
- mempool_destroy(ext4_bounce_page_pool);
- ext4_bounce_page_pool = NULL;
- if (ext4_read_workqueue)
- destroy_workqueue(ext4_read_workqueue);
- ext4_read_workqueue = NULL;
- if (ext4_crypto_ctx_cachep)
- kmem_cache_destroy(ext4_crypto_ctx_cachep);
- ext4_crypto_ctx_cachep = NULL;
- if (ext4_crypt_info_cachep)
- kmem_cache_destroy(ext4_crypt_info_cachep);
- ext4_crypt_info_cachep = NULL;
-}
-
-/**
- * ext4_init_crypto() - Set up for ext4 encryption.
- *
- * We only call this when we start accessing encrypted files, since it
- * results in memory getting allocated that wouldn't otherwise be used.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int ext4_init_crypto(void)
-{
- int i, res = -ENOMEM;
-
- mutex_lock(&crypto_init);
- if (ext4_read_workqueue)
- goto already_initialized;
- ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0);
- if (!ext4_read_workqueue)
- goto fail;
-
- ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx,
- SLAB_RECLAIM_ACCOUNT);
- if (!ext4_crypto_ctx_cachep)
- goto fail;
-
- ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info,
- SLAB_RECLAIM_ACCOUNT);
- if (!ext4_crypt_info_cachep)
- goto fail;
-
- for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
- struct ext4_crypto_ctx *ctx;
-
- ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
- if (!ctx) {
- res = -ENOMEM;
- goto fail;
- }
- list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
- }
-
- ext4_bounce_page_pool =
- mempool_create_page_pool(num_prealloc_crypto_pages, 0);
- if (!ext4_bounce_page_pool) {
- res = -ENOMEM;
- goto fail;
- }
-already_initialized:
- mutex_unlock(&crypto_init);
- return 0;
-fail:
- ext4_exit_crypto();
- mutex_unlock(&crypto_init);
- return res;
-}
-
-void ext4_restore_control_page(struct page *data_page)
-{
- struct ext4_crypto_ctx *ctx =
- (struct ext4_crypto_ctx *)page_private(data_page);
-
- set_page_private(data_page, (unsigned long)NULL);
- ClearPagePrivate(data_page);
- unlock_page(data_page);
- ext4_release_crypto_ctx(ctx);
-}
-
-/**
- * ext4_crypt_complete() - The completion callback for page encryption
- * @req: The asynchronous encryption request context
- * @res: The result of the encryption operation
- */
-static void ext4_crypt_complete(struct crypto_async_request *req, int res)
-{
- struct ext4_completion_result *ecr = req->data;
-
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
-}
-
-typedef enum {
- EXT4_DECRYPT = 0,
- EXT4_ENCRYPT,
-} ext4_direction_t;
-
-static int ext4_page_crypto(struct inode *inode,
- ext4_direction_t rw,
- pgoff_t index,
- struct page *src_page,
- struct page *dest_page)
-
-{
- u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
- struct skcipher_request *req = NULL;
- DECLARE_EXT4_COMPLETION_RESULT(ecr);
- struct scatterlist dst, src;
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
- int res = 0;
-
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n",
- __func__);
- return -ENOMEM;
- }
- skcipher_request_set_callback(
- req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- ext4_crypt_complete, &ecr);
-
- BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index));
- memcpy(xts_tweak, &index, sizeof(index));
- memset(&xts_tweak[sizeof(index)], 0,
- EXT4_XTS_TWEAK_SIZE - sizeof(index));
-
- sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
- sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
- skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
- xts_tweak);
- if (rw == EXT4_DECRYPT)
- res = crypto_skcipher_decrypt(req);
- else
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
- skcipher_request_free(req);
- if (res) {
- printk_ratelimited(
- KERN_ERR
- "%s: crypto_skcipher_encrypt() returned %d\n",
- __func__, res);
- return res;
- }
- return 0;
-}
-
-static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx)
-{
- ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT);
- if (ctx->w.bounce_page == NULL)
- return ERR_PTR(-ENOMEM);
- ctx->flags |= EXT4_WRITE_PATH_FL;
- return ctx->w.bounce_page;
-}
-
-/**
- * ext4_encrypt() - Encrypts a page
- * @inode: The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- *
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
- *
- * Called on the page write path. The caller must call
- * ext4_restore_control_page() on the returned ciphertext page to
- * release the bounce buffer and the encryption context.
- *
- * Return: An allocated page with the encrypted content on success. Else, an
- * error value or NULL.
- */
-struct page *ext4_encrypt(struct inode *inode,
- struct page *plaintext_page)
-{
- struct ext4_crypto_ctx *ctx;
- struct page *ciphertext_page = NULL;
- int err;
-
- BUG_ON(!PageLocked(plaintext_page));
-
- ctx = ext4_get_crypto_ctx(inode);
- if (IS_ERR(ctx))
- return (struct page *) ctx;
-
- /* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_bounce_page(ctx);
- if (IS_ERR(ciphertext_page))
- goto errout;
- ctx->w.control_page = plaintext_page;
- err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page);
- if (err) {
- ciphertext_page = ERR_PTR(err);
- errout:
- ext4_release_crypto_ctx(ctx);
- return ciphertext_page;
- }
- SetPagePrivate(ciphertext_page);
- set_page_private(ciphertext_page, (unsigned long)ctx);
- lock_page(ciphertext_page);
- return ciphertext_page;
-}
-
-/**
- * ext4_decrypt() - Decrypts a page in-place
- * @ctx: The encryption context.
- * @page: The page to decrypt. Must be locked.
- *
- * Decrypts page in-place using the ctx encryption context.
- *
- * Called from the read completion callback.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int ext4_decrypt(struct page *page)
-{
- BUG_ON(!PageLocked(page));
-
- return ext4_page_crypto(page->mapping->host,
- EXT4_DECRYPT, page->index, page, page);
-}
-
-int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
- ext4_fsblk_t pblk, ext4_lblk_t len)
-{
- struct ext4_crypto_ctx *ctx;
- struct page *ciphertext_page = NULL;
- struct bio *bio;
- int ret, err = 0;
-
-#if 0
- ext4_msg(inode->i_sb, KERN_CRIT,
- "ext4_encrypted_zeroout ino %lu lblk %u len %u",
- (unsigned long) inode->i_ino, lblk, len);
-#endif
-
- BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
-
- ctx = ext4_get_crypto_ctx(inode);
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
-
- ciphertext_page = alloc_bounce_page(ctx);
- if (IS_ERR(ciphertext_page)) {
- err = PTR_ERR(ciphertext_page);
- goto errout;
- }
-
- while (len--) {
- err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk,
- ZERO_PAGE(0), ciphertext_page);
- if (err)
- goto errout;
-
- bio = bio_alloc(GFP_KERNEL, 1);
- if (!bio) {
- err = -ENOMEM;
- goto errout;
- }
- bio->bi_bdev = inode->i_sb->s_bdev;
- bio->bi_iter.bi_sector =
- pblk << (inode->i_sb->s_blocksize_bits - 9);
- ret = bio_add_page(bio, ciphertext_page,
- inode->i_sb->s_blocksize, 0);
- if (ret != inode->i_sb->s_blocksize) {
- /* should never happen! */
- ext4_msg(inode->i_sb, KERN_ERR,
- "bio_add_page failed: %d", ret);
- WARN_ON(1);
- bio_put(bio);
- err = -EIO;
- goto errout;
- }
- err = submit_bio_wait(WRITE, bio);
- if ((err == 0) && bio->bi_error)
- err = -EIO;
- bio_put(bio);
- if (err)
- goto errout;
- lblk++; pblk++;
- }
- err = 0;
-errout:
- ext4_release_crypto_ctx(ctx);
- return err;
-}
-
-bool ext4_valid_contents_enc_mode(uint32_t mode)
-{
- return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS);
-}
-
-/**
- * ext4_validate_encryption_key_size() - Validate the encryption key size
- * @mode: The key mode.
- * @size: The key size to validate.
- *
- * Return: The validated key size for @mode. Zero if invalid.
- */
-uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
-{
- if (size == ext4_encryption_key_size(mode))
- return size;
- return 0;
-}
-
-/*
- * Validate dentries for encrypted directories to make sure we aren't
- * potentially caching stale data after a key has been added or
- * removed.
- */
-static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info;
- int dir_has_key, cached_with_key;
-
- if (!ext4_encrypted_inode(dir))
- return 0;
-
- if (ci && ci->ci_keyring_key &&
- (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
- (1 << KEY_FLAG_REVOKED) |
- (1 << KEY_FLAG_DEAD))))
- ci = NULL;
-
- /* this should eventually be an flag in d_flags */
- cached_with_key = dentry->d_fsdata != NULL;
- dir_has_key = (ci != NULL);
-
- /*
- * If the dentry was cached without the key, and it is a
- * negative dentry, it might be a valid name. We can't check
- * if the key has since been made available due to locking
- * reasons, so we fail the validation so ext4_lookup() can do
- * this check.
- *
- * We also fail the validation if the dentry was created with
- * the key present, but we no longer have the key, or vice versa.
- */
- if ((!cached_with_key && d_is_negative(dentry)) ||
- (!cached_with_key && dir_has_key) ||
- (cached_with_key && !dir_has_key)) {
-#if 0 /* Revalidation debug */
- char buf[80];
- char *cp = simple_dname(dentry, buf, sizeof(buf));
-
- if (IS_ERR(cp))
- cp = (char *) "???";
- pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata,
- cached_with_key, d_is_negative(dentry),
- dir_has_key);
-#endif
- return 0;
- }
- return 1;
-}
-
-const struct dentry_operations ext4_encrypted_d_ops = {
- .d_revalidate = ext4_d_revalidate,
-};
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
deleted file mode 100644
index 1a2f360405dbd..0000000000000
--- a/fs/ext4/crypto_fname.c
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * linux/fs/ext4/crypto_fname.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains functions for filename crypto management in ext4
- *
- * Written by Uday Savagaonkar, 2014.
- *
- * This has not yet undergone a rigorous security audit.
- *
- */
-
-#include <crypto/skcipher.h>
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-
-#include "ext4.h"
-#include "ext4_crypto.h"
-#include "xattr.h"
-
-/**
- * ext4_dir_crypt_complete() -
- */
-static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res)
-{
- struct ext4_completion_result *ecr = req->data;
-
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
-}
-
-bool ext4_valid_filenames_enc_mode(uint32_t mode)
-{
- return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS);
-}
-
-static unsigned max_name_len(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
- EXT4_NAME_LEN;
-}
-
-/**
- * ext4_fname_encrypt() -
- *
- * This function encrypts the input filename, and returns the length of the
- * ciphertext. Errors are returned as negative numbers. We trust the caller to
- * allocate sufficient memory to oname string.
- */
-static int ext4_fname_encrypt(struct inode *inode,
- const struct qstr *iname,
- struct ext4_str *oname)
-{
- u32 ciphertext_len;
- struct skcipher_request *req = NULL;
- DECLARE_EXT4_COMPLETION_RESULT(ecr);
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
- int res = 0;
- char iv[EXT4_CRYPTO_BLOCK_SIZE];
- struct scatterlist src_sg, dst_sg;
- int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
- char *workbuf, buf[32], *alloc_buf = NULL;
- unsigned lim = max_name_len(inode);
-
- if (iname->len <= 0 || iname->len > lim)
- return -EIO;
-
- ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
- EXT4_CRYPTO_BLOCK_SIZE : iname->len;
- ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
- ciphertext_len = (ciphertext_len > lim)
- ? lim : ciphertext_len;
-
- if (ciphertext_len <= sizeof(buf)) {
- workbuf = buf;
- } else {
- alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
- if (!alloc_buf)
- return -ENOMEM;
- workbuf = alloc_buf;
- }
-
- /* Allocate request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(
- KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
- kfree(alloc_buf);
- return -ENOMEM;
- }
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- ext4_dir_crypt_complete, &ecr);
-
- /* Copy the input */
- memcpy(workbuf, iname->name, iname->len);
- if (iname->len < ciphertext_len)
- memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
-
- /* Initialize IV */
- memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
-
- /* Create encryption request */
- sg_init_one(&src_sg, workbuf, ciphertext_len);
- sg_init_one(&dst_sg, oname->name, ciphertext_len);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
- kfree(alloc_buf);
- skcipher_request_free(req);
- if (res < 0) {
- printk_ratelimited(
- KERN_ERR "%s: Error (error code %d)\n", __func__, res);
- }
- oname->len = ciphertext_len;
- return res;
-}
-
-/*
- * ext4_fname_decrypt()
- * This function decrypts the input filename, and returns
- * the length of the plaintext.
- * Errors are returned as negative numbers.
- * We trust the caller to allocate sufficient memory to oname string.
- */
-static int ext4_fname_decrypt(struct inode *inode,
- const struct ext4_str *iname,
- struct ext4_str *oname)
-{
- struct ext4_str tmp_in[2], tmp_out[1];
- struct skcipher_request *req = NULL;
- DECLARE_EXT4_COMPLETION_RESULT(ecr);
- struct scatterlist src_sg, dst_sg;
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
- int res = 0;
- char iv[EXT4_CRYPTO_BLOCK_SIZE];
- unsigned lim = max_name_len(inode);
-
- if (iname->len <= 0 || iname->len > lim)
- return -EIO;
-
- tmp_in[0].name = iname->name;
- tmp_in[0].len = iname->len;
- tmp_out[0].name = oname->name;
-
- /* Allocate request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(
- KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
- return -ENOMEM;
- }
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- ext4_dir_crypt_complete, &ecr);
-
- /* Initialize IV */
- memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
-
- /* Create encryption request */
- sg_init_one(&src_sg, iname->name, iname->len);
- sg_init_one(&dst_sg, oname->name, oname->len);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
- res = crypto_skcipher_decrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
- skcipher_request_free(req);
- if (res < 0) {
- printk_ratelimited(
- KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
- __func__, res);
- return res;
- }
-
- oname->len = strnlen(oname->name, iname->len);
- return oname->len;
-}
-
-static const char *lookup_table =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
-
-/**
- * ext4_fname_encode_digest() -
- *
- * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
- * The encoded string is roughly 4/3 times the size of the input string.
- */
-static int digest_encode(const char *src, int len, char *dst)
-{
- int i = 0, bits = 0, ac = 0;
- char *cp = dst;
-
- while (i < len) {
- ac += (((unsigned char) src[i]) << bits);
- bits += 8;
- do {
- *cp++ = lookup_table[ac & 0x3f];
- ac >>= 6;
- bits -= 6;
- } while (bits >= 6);
- i++;
- }
- if (bits)
- *cp++ = lookup_table[ac & 0x3f];
- return cp - dst;
-}
-
-static int digest_decode(const char *src, int len, char *dst)
-{
- int i = 0, bits = 0, ac = 0;
- const char *p;
- char *cp = dst;
-
- while (i < len) {
- p = strchr(lookup_table, src[i]);
- if (p == NULL || src[i] == 0)
- return -2;
- ac += (p - lookup_table) << bits;
- bits += 6;
- if (bits >= 8) {
- *cp++ = ac & 0xff;
- ac >>= 8;
- bits -= 8;
- }
- i++;
- }
- if (ac)
- return -1;
- return cp - dst;
-}
-
-/**
- * ext4_fname_crypto_round_up() -
- *
- * Return: The next multiple of block size
- */
-u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
-{
- return ((size+blksize-1)/blksize)*blksize;
-}
-
-unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen)
-{
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- int padding = 32;
-
- if (ci)
- padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
- if (ilen < EXT4_CRYPTO_BLOCK_SIZE)
- ilen = EXT4_CRYPTO_BLOCK_SIZE;
- return ext4_fname_crypto_round_up(ilen, padding);
-}
-
-/*
- * ext4_fname_crypto_alloc_buffer() -
- *
- * Allocates an output buffer that is sufficient for the crypto operation
- * specified by the context and the direction.
- */
-int ext4_fname_crypto_alloc_buffer(struct inode *inode,
- u32 ilen, struct ext4_str *crypto_str)
-{
- unsigned int olen = ext4_fname_encrypted_size(inode, ilen);
-
- crypto_str->len = olen;
- if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
- olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
- /* Allocated buffer can hold one more character to null-terminate the
- * string */
- crypto_str->name = kmalloc(olen+1, GFP_NOFS);
- if (!(crypto_str->name))
- return -ENOMEM;
- return 0;
-}
-
-/**
- * ext4_fname_crypto_free_buffer() -
- *
- * Frees the buffer allocated for crypto operation.
- */
-void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
-{
- if (!crypto_str)
- return;
- kfree(crypto_str->name);
- crypto_str->name = NULL;
-}
-
-/**
- * ext4_fname_disk_to_usr() - converts a filename from disk space to user space
- */
-int _ext4_fname_disk_to_usr(struct inode *inode,
- struct dx_hash_info *hinfo,
- const struct ext4_str *iname,
- struct ext4_str *oname)
-{
- char buf[24];
- int ret;
-
- if (iname->len < 3) {
- /*Check for . and .. */
- if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') {
- oname->name[0] = '.';
- oname->name[iname->len-1] = '.';
- oname->len = iname->len;
- return oname->len;
- }
- }
- if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) {
- EXT4_ERROR_INODE(inode, "encrypted inode too small");
- return -EUCLEAN;
- }
- if (EXT4_I(inode)->i_crypt_info)
- return ext4_fname_decrypt(inode, iname, oname);
-
- if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
- ret = digest_encode(iname->name, iname->len, oname->name);
- oname->len = ret;
- return ret;
- }
- if (hinfo) {
- memcpy(buf, &hinfo->hash, 4);
- memcpy(buf+4, &hinfo->minor_hash, 4);
- } else
- memset(buf, 0, 8);
- memcpy(buf + 8, iname->name + iname->len - 16, 16);
- oname->name[0] = '_';
- ret = digest_encode(buf, 24, oname->name+1);
- oname->len = ret + 1;
- return ret + 1;
-}
-
-int ext4_fname_disk_to_usr(struct inode *inode,
- struct dx_hash_info *hinfo,
- const struct ext4_dir_entry_2 *de,
- struct ext4_str *oname)
-{
- struct ext4_str iname = {.name = (unsigned char *) de->name,
- .len = de->name_len };
-
- return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname);
-}
-
-
-/**
- * ext4_fname_usr_to_disk() - converts a filename from user space to disk space
- */
-int ext4_fname_usr_to_disk(struct inode *inode,
- const struct qstr *iname,
- struct ext4_str *oname)
-{
- int res;
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
-
- if (iname->len < 3) {
- /*Check for . and .. */
- if (iname->name[0] == '.' &&
- iname->name[iname->len-1] == '.') {
- oname->name[0] = '.';
- oname->name[iname->len-1] = '.';
- oname->len = iname->len;
- return oname->len;
- }
- }
- if (ci) {
- res = ext4_fname_encrypt(inode, iname, oname);
- return res;
- }
- /* Without a proper key, a user is not allowed to modify the filenames
- * in a directory. Consequently, a user space name cannot be mapped to
- * a disk-space name */
- return -EACCES;
-}
-
-int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
- int lookup, struct ext4_filename *fname)
-{
- struct ext4_crypt_info *ci;
- int ret = 0, bigname = 0;
-
- memset(fname, 0, sizeof(struct ext4_filename));
- fname->usr_fname = iname;
-
- if (!ext4_encrypted_inode(dir) ||
- ((iname->name[0] == '.') &&
- ((iname->len == 1) ||
- ((iname->name[1] == '.') && (iname->len == 2))))) {
- fname->disk_name.name = (unsigned char *) iname->name;
- fname->disk_name.len = iname->len;
- return 0;
- }
- ret = ext4_get_encryption_info(dir);
- if (ret)
- return ret;
- ci = EXT4_I(dir)->i_crypt_info;
- if (ci) {
- ret = ext4_fname_crypto_alloc_buffer(dir, iname->len,
- &fname->crypto_buf);
- if (ret < 0)
- return ret;
- ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf);
- if (ret < 0)
- goto errout;
- fname->disk_name.name = fname->crypto_buf.name;
- fname->disk_name.len = fname->crypto_buf.len;
- return 0;
- }
- if (!lookup)
- return -EACCES;
-
- /* We don't have the key and we are doing a lookup; decode the
- * user-supplied name
- */
- if (iname->name[0] == '_')
- bigname = 1;
- if ((bigname && (iname->len != 33)) ||
- (!bigname && (iname->len > 43)))
- return -ENOENT;
-
- fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
- if (fname->crypto_buf.name == NULL)
- return -ENOMEM;
- ret = digest_decode(iname->name + bigname, iname->len - bigname,
- fname->crypto_buf.name);
- if (ret < 0) {
- ret = -ENOENT;
- goto errout;
- }
- fname->crypto_buf.len = ret;
- if (bigname) {
- memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4);
- memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4);
- } else {
- fname->disk_name.name = fname->crypto_buf.name;
- fname->disk_name.len = fname->crypto_buf.len;
- }
- return 0;
-errout:
- kfree(fname->crypto_buf.name);
- fname->crypto_buf.name = NULL;
- return ret;
-}
-
-void ext4_fname_free_filename(struct ext4_filename *fname)
-{
- kfree(fname->crypto_buf.name);
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
-}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
deleted file mode 100644
index 0129d688d1f71..0000000000000
--- a/fs/ext4/crypto_key.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * linux/fs/ext4/crypto_key.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions for ext4
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-
-#include <crypto/skcipher.h>
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <uapi/linux/keyctl.h>
-
-#include "ext4.h"
-#include "xattr.h"
-
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
- struct ext4_completion_result *ecr = req->data;
-
- if (rc == -EINPROGRESS)
- return;
-
- ecr->res = rc;
- complete(&ecr->completion);
-}
-
-/**
- * ext4_derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivation.
- * @source_key: Source key to which to apply derivation.
- * @derived_key: Derived key.
- *
- * Return: Zero on success; non-zero otherwise.
- */
-static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
- char source_key[EXT4_AES_256_XTS_KEY_SIZE],
- char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
-{
- int res = 0;
- struct skcipher_request *req = NULL;
- DECLARE_EXT4_COMPLETION_RESULT(ecr);
- struct scatterlist src_sg, dst_sg;
- struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
-
- if (IS_ERR(tfm)) {
- res = PTR_ERR(tfm);
- tfm = NULL;
- goto out;
- }
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- res = -ENOMEM;
- goto out;
- }
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- derive_crypt_complete, &ecr);
- res = crypto_skcipher_setkey(tfm, deriving_key,
- EXT4_AES_128_ECB_KEY_SIZE);
- if (res < 0)
- goto out;
- sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
- sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg,
- EXT4_AES_256_XTS_KEY_SIZE, NULL);
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
-
-out:
- skcipher_request_free(req);
- crypto_free_skcipher(tfm);
- return res;
-}
-
-void ext4_free_crypt_info(struct ext4_crypt_info *ci)
-{
- if (!ci)
- return;
-
- if (ci->ci_keyring_key)
- key_put(ci->ci_keyring_key);
- crypto_free_skcipher(ci->ci_ctfm);
- kmem_cache_free(ext4_crypt_info_cachep, ci);
-}
-
-void ext4_free_encryption_info(struct inode *inode,
- struct ext4_crypt_info *ci)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_crypt_info *prev;
-
- if (ci == NULL)
- ci = ACCESS_ONCE(ei->i_crypt_info);
- if (ci == NULL)
- return;
- prev = cmpxchg(&ei->i_crypt_info, ci, NULL);
- if (prev != ci)
- return;
-
- ext4_free_crypt_info(ci);
-}
-
-int _ext4_get_encryption_info(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_crypt_info *crypt_info;
- char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
- (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1];
- struct key *keyring_key = NULL;
- struct ext4_encryption_key *master_key;
- struct ext4_encryption_context ctx;
- const struct user_key_payload *ukp;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct crypto_skcipher *ctfm;
- const char *cipher_str;
- char raw_key[EXT4_MAX_KEY_SIZE];
- char mode;
- int res;
-
- if (!ext4_read_workqueue) {
- res = ext4_init_crypto();
- if (res)
- return res;
- }
-
-retry:
- crypt_info = ACCESS_ONCE(ei->i_crypt_info);
- if (crypt_info) {
- if (!crypt_info->ci_keyring_key ||
- key_validate(crypt_info->ci_keyring_key) == 0)
- return 0;
- ext4_free_encryption_info(inode, crypt_info);
- goto retry;
- }
-
- res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- &ctx, sizeof(ctx));
- if (res < 0) {
- if (!DUMMY_ENCRYPTION_ENABLED(sbi))
- return res;
- ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
- ctx.filenames_encryption_mode =
- EXT4_ENCRYPTION_MODE_AES_256_CTS;
- ctx.flags = 0;
- } else if (res != sizeof(ctx))
- return -EINVAL;
- res = 0;
-
- crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL);
- if (!crypt_info)
- return -ENOMEM;
-
- crypt_info->ci_flags = ctx.flags;
- crypt_info->ci_data_mode = ctx.contents_encryption_mode;
- crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
- crypt_info->ci_ctfm = NULL;
- crypt_info->ci_keyring_key = NULL;
- memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
- sizeof(crypt_info->ci_master_key));
- if (S_ISREG(inode->i_mode))
- mode = crypt_info->ci_data_mode;
- else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- mode = crypt_info->ci_filename_mode;
- else
- BUG();
- switch (mode) {
- case EXT4_ENCRYPTION_MODE_AES_256_XTS:
- cipher_str = "xts(aes)";
- break;
- case EXT4_ENCRYPTION_MODE_AES_256_CTS:
- cipher_str = "cts(cbc(aes))";
- break;
- default:
- printk_once(KERN_WARNING
- "ext4: unsupported key mode %d (ino %u)\n",
- mode, (unsigned) inode->i_ino);
- res = -ENOKEY;
- goto out;
- }
- if (DUMMY_ENCRYPTION_ENABLED(sbi)) {
- memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
- goto got_key;
- }
- memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX,
- EXT4_KEY_DESC_PREFIX_SIZE);
- sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE,
- "%*phN", EXT4_KEY_DESCRIPTOR_SIZE,
- ctx.master_key_descriptor);
- full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
- (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0';
- keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
- if (IS_ERR(keyring_key)) {
- res = PTR_ERR(keyring_key);
- keyring_key = NULL;
- goto out;
- }
- crypt_info->ci_keyring_key = keyring_key;
- if (keyring_key->type != &key_type_logon) {
- printk_once(KERN_WARNING
- "ext4: key type must be logon\n");
- res = -ENOKEY;
- goto out;
- }
- down_read(&keyring_key->sem);
- ukp = user_key_payload(keyring_key);
- if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
- res = -EINVAL;
- up_read(&keyring_key->sem);
- goto out;
- }
- master_key = (struct ext4_encryption_key *)ukp->data;
- BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
- EXT4_KEY_DERIVATION_NONCE_SIZE);
- if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) {
- printk_once(KERN_WARNING
- "ext4: key size incorrect: %d\n",
- master_key->size);
- res = -ENOKEY;
- up_read(&keyring_key->sem);
- goto out;
- }
- res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
- raw_key);
- up_read(&keyring_key->sem);
- if (res)
- goto out;
-got_key:
- ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
- if (!ctfm || IS_ERR(ctfm)) {
- res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- printk(KERN_DEBUG
- "%s: error %d (inode %u) allocating crypto tfm\n",
- __func__, res, (unsigned) inode->i_ino);
- goto out;
- }
- crypt_info->ci_ctfm = ctfm;
- crypto_skcipher_clear_flags(ctfm, ~0);
- crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm),
- CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_skcipher_setkey(ctfm, raw_key,
- ext4_encryption_key_size(mode));
- if (res)
- goto out;
- memzero_explicit(raw_key, sizeof(raw_key));
- if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) {
- ext4_free_crypt_info(crypt_info);
- goto retry;
- }
- return 0;
-
-out:
- if (res == -ENOKEY)
- res = 0;
- ext4_free_crypt_info(crypt_info);
- memzero_explicit(raw_key, sizeof(raw_key));
- return res;
-}
-
-int ext4_has_encryption_key(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- return (ei->i_crypt_info != NULL);
-}
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
deleted file mode 100644
index ad050698143fd..0000000000000
--- a/fs/ext4/crypto_policy.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * linux/fs/ext4/crypto_policy.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption policy functions for ext4
- *
- * Written by Michael Halcrow, 2015.
- */
-
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include "ext4_jbd2.h"
-#include "ext4.h"
-#include "xattr.h"
-
-static int ext4_inode_has_encryption_context(struct inode *inode)
-{
- int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0);
- return (res > 0);
-}
-
-/*
- * check whether the policy is consistent with the encryption context
- * for the inode
- */
-static int ext4_is_encryption_context_consistent_with_policy(
- struct inode *inode, const struct ext4_encryption_policy *policy)
-{
- struct ext4_encryption_context ctx;
- int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx));
- if (res != sizeof(ctx))
- return 0;
- return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
- EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
- (ctx.flags ==
- policy->flags) &&
- (ctx.contents_encryption_mode ==
- policy->contents_encryption_mode) &&
- (ctx.filenames_encryption_mode ==
- policy->filenames_encryption_mode));
-}
-
-static int ext4_create_encryption_context_from_policy(
- struct inode *inode, const struct ext4_encryption_policy *policy)
-{
- struct ext4_encryption_context ctx;
- handle_t *handle;
- int res, res2;
-
- res = ext4_convert_inline_data(inode);
- if (res)
- return res;
-
- ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
- memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
- EXT4_KEY_DESCRIPTOR_SIZE);
- if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid contents encryption mode %d\n", __func__,
- policy->contents_encryption_mode);
- return -EINVAL;
- }
- if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid filenames encryption mode %d\n", __func__,
- policy->filenames_encryption_mode);
- return -EINVAL;
- }
- if (policy->flags & ~EXT4_POLICY_FLAGS_VALID)
- return -EINVAL;
- ctx.contents_encryption_mode = policy->contents_encryption_mode;
- ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
- ctx.flags = policy->flags;
- BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
- get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
-
- handle = ext4_journal_start(inode, EXT4_HT_MISC,
- ext4_jbd2_credits_xattr(inode));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- res = ext4_mark_inode_dirty(handle, inode);
- if (res)
- EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
- }
- res2 = ext4_journal_stop(handle);
- if (!res)
- res = res2;
- return res;
-}
-
-int ext4_process_policy(const struct ext4_encryption_policy *policy,
- struct inode *inode)
-{
- if (policy->version != 0)
- return -EINVAL;
-
- if (!ext4_inode_has_encryption_context(inode)) {
- if (!S_ISDIR(inode->i_mode))
- return -EINVAL;
- if (!ext4_empty_dir(inode))
- return -ENOTEMPTY;
- return ext4_create_encryption_context_from_policy(inode,
- policy);
- }
-
- if (ext4_is_encryption_context_consistent_with_policy(inode, policy))
- return 0;
-
- printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
- __func__);
- return -EINVAL;
-}
-
-int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
-{
- struct ext4_encryption_context ctx;
-
- int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- &ctx, sizeof(ctx));
- if (res != sizeof(ctx))
- return -ENOENT;
- if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1)
- return -EINVAL;
- policy->version = 0;
- policy->contents_encryption_mode = ctx.contents_encryption_mode;
- policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
- policy->flags = ctx.flags;
- memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
- EXT4_KEY_DESCRIPTOR_SIZE);
- return 0;
-}
-
-int ext4_is_child_context_consistent_with_parent(struct inode *parent,
- struct inode *child)
-{
- struct ext4_crypt_info *parent_ci, *child_ci;
- int res;
-
- if ((parent == NULL) || (child == NULL)) {
- pr_err("parent %p child %p\n", parent, child);
- WARN_ON(1); /* Should never happen */
- return 0;
- }
- /* no restrictions if the parent directory is not encrypted */
- if (!ext4_encrypted_inode(parent))
- return 1;
- /* if the child directory is not encrypted, this is always a problem */
- if (!ext4_encrypted_inode(child))
- return 0;
- res = ext4_get_encryption_info(parent);
- if (res)
- return 0;
- res = ext4_get_encryption_info(child);
- if (res)
- return 0;
- parent_ci = EXT4_I(parent)->i_crypt_info;
- child_ci = EXT4_I(child)->i_crypt_info;
- if (!parent_ci && !child_ci)
- return 1;
- if (!parent_ci || !child_ci)
- return 0;
-
- return (memcmp(parent_ci->ci_master_key,
- child_ci->ci_master_key,
- EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
- (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
- (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
- (parent_ci->ci_flags == child_ci->ci_flags));
-}
-
-/**
- * ext4_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child: Child inode that inherits the context from @parent.
- *
- * Return: Zero on success, non-zero otherwise
- */
-int ext4_inherit_context(struct inode *parent, struct inode *child)
-{
- struct ext4_encryption_context ctx;
- struct ext4_crypt_info *ci;
- int res;
-
- res = ext4_get_encryption_info(parent);
- if (res < 0)
- return res;
- ci = EXT4_I(parent)->i_crypt_info;
- if (ci == NULL)
- return -ENOKEY;
-
- ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
- if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
- ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
- ctx.filenames_encryption_mode =
- EXT4_ENCRYPTION_MODE_AES_256_CTS;
- ctx.flags = 0;
- memset(ctx.master_key_descriptor, 0x42,
- EXT4_KEY_DESCRIPTOR_SIZE);
- res = 0;
- } else {
- ctx.contents_encryption_mode = ci->ci_data_mode;
- ctx.filenames_encryption_mode = ci->ci_filename_mode;
- ctx.flags = ci->ci_flags;
- memcpy(ctx.master_key_descriptor, ci->ci_master_key,
- EXT4_KEY_DESCRIPTOR_SIZE);
- }
- get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
- res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), 0);
- if (!res) {
- ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
- ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA);
- res = ext4_get_encryption_info(child);
- }
- return res;
-}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 50ba27cbed034..67415e0e6af06 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -109,10 +109,10 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct super_block *sb = inode->i_sb;
struct buffer_head *bh = NULL;
int dir_has_error = 0;
- struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+ struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
if (ext4_encrypted_inode(inode)) {
- err = ext4_get_encryption_info(inode);
+ err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
return err;
}
@@ -139,8 +139,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
}
if (ext4_encrypted_inode(inode)) {
- err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN,
- &fname_crypto_str);
+ err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
@@ -150,18 +149,23 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto errout;
+ }
+ cond_resched();
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
&file->f_ra, file,
index, 1);
- file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
@@ -248,16 +252,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
get_dtype(sb, de->file_type)))
goto done;
} else {
- int save_len = fname_crypto_str.len;
+ int save_len = fstr.len;
+ struct fscrypt_str de_name =
+ FSTR_INIT(de->name,
+ de->name_len);
/* Directory is encrypted */
- err = ext4_fname_disk_to_usr(inode,
- NULL, de, &fname_crypto_str);
- fname_crypto_str.len = save_len;
+ err = fscrypt_fname_disk_to_usr(inode,
+ 0, 0, &de_name, &fstr);
+ fstr.len = save_len;
if (err < 0)
goto errout;
if (!dir_emit(ctx,
- fname_crypto_str.name, err,
+ fstr.name, err,
le32_to_cpu(de->inode),
get_dtype(sb, de->file_type)))
goto done;
@@ -266,7 +273,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
ctx->pos += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
- if ((ctx->pos < inode->i_size) && !dir_relax(inode))
+ if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode))
goto done;
brelse(bh);
bh = NULL;
@@ -276,7 +283,7 @@ done:
err = 0;
errout:
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- ext4_fname_crypto_free_buffer(&fname_crypto_str);
+ fscrypt_fname_free_buffer(&fstr);
#endif
brelse(bh);
return err;
@@ -427,7 +434,7 @@ void ext4_htree_free_dir_info(struct dir_private_info *p)
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent,
- struct ext4_str *ent_name)
+ struct fscrypt_str *ent_name)
{
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
@@ -604,7 +611,7 @@ finished:
static int ext4_dir_open(struct inode * inode, struct file * filp)
{
if (ext4_encrypted_inode(inode))
- return ext4_get_encryption_info(inode) ? -EACCES : 0;
+ return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
@@ -644,7 +651,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
const struct file_operations ext4_dir_operations = {
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
- .iterate = ext4_readdir,
+ .iterate_shared = ext4_readdir,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c047435198659..ea31931386ec3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -32,7 +32,9 @@
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <crypto/hash.h>
+#include <linux/fscrypto.h>
#include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -581,6 +583,9 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
+ /* Caller will submit data before dropping transaction handle. This
+ * allows jbd2 to avoid submitting data before commit. */
+#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
/*
* The bit position of these flags must not overlap with any of the
@@ -604,15 +609,6 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
-/* Encryption algorithms */
-#define EXT4_ENCRYPTION_MODE_INVALID 0
-#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
-#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
-#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
-#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4
-
-#include "ext4_crypto.h"
-
/*
* ioctl commands
*/
@@ -634,9 +630,9 @@ enum {
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
-#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy)
-#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
-#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
+#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
+#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
+#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
#ifndef FS_IOC_FSGETXATTR
/* Until the uapi changes get merged for project quota... */
@@ -912,6 +908,29 @@ do { \
#include "extents_status.h"
/*
+ * Lock subclasses for i_data_sem in the ext4_inode_info structure.
+ *
+ * These are needed to avoid lockdep false positives when we need to
+ * allocate blocks to the quota inode during ext4_map_blocks(), while
+ * holding i_data_sem for a normal (non-quota) inode. Since we don't
+ * do quota tracking for the quota inode, this avoids deadlock (as
+ * well as infinite recursion, since it isn't turtles all the way
+ * down...)
+ *
+ * I_DATA_SEM_NORMAL - Used for most inodes
+ * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode
+ * where the second inode has larger inode number
+ * than the first
+ * I_DATA_SEM_QUOTA - Used for quota inodes only
+ */
+enum {
+ I_DATA_SEM_NORMAL = 0,
+ I_DATA_SEM_OTHER,
+ I_DATA_SEM_QUOTA,
+};
+
+
+/*
* fourth extended file system inode data in memory
*/
struct ext4_inode_info {
@@ -1055,10 +1074,6 @@ struct ext4_inode_info {
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- /* Encryption params */
- struct ext4_crypt_info *i_crypt_info;
-#endif
kprojid_t i_projid;
};
@@ -1317,6 +1332,11 @@ struct ext4_super_block {
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#define EXT4_KEY_DESC_PREFIX "ext4:"
+#define EXT4_KEY_DESC_PREFIX_SIZE 5
+#endif
+
/*
* fourth extended-fs super-block data in memory
*/
@@ -1403,6 +1423,7 @@ struct ext4_sb_info {
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
+ unsigned int s_mb_free_pending;
/* tunables */
unsigned long s_stripe;
@@ -1482,6 +1503,15 @@ struct ext4_sb_info {
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+
+ /* Barrier between changing inodes' journal flags and writepages ops. */
+ struct percpu_rw_semaphore s_journal_flag_rwsem;
+
+ /* Encryption support */
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE];
+ u8 key_prefix_size;
+#endif
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1526,7 +1556,6 @@ enum {
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
- EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
@@ -1581,15 +1610,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
/*
* Returns true if the inode is inode is encrypted
*/
-static inline int ext4_encrypted_inode(struct inode *inode)
-{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
-#else
- return 0;
-#endif
-}
-
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@@ -1961,7 +1981,7 @@ ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
unsigned len = le16_to_cpu(dlen);
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len == EXT4_MAX_REC_LEN || len == 0)
return blocksize;
return (len & 65532) | ((len & 3) << 16);
@@ -1974,7 +1994,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
BUG();
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len < 65536)
return cpu_to_le16(len);
if (len == blocksize) {
@@ -2053,10 +2073,10 @@ struct dx_hash_info
struct ext4_filename {
const struct qstr *usr_fname;
- struct ext4_str disk_name;
+ struct fscrypt_str disk_name;
struct dx_hash_info hinfo;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct ext4_str crypto_buf;
+ struct fscrypt_str crypto_buf;
#endif
};
@@ -2267,130 +2287,82 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-/* crypto_policy.c */
-int ext4_is_child_context_consistent_with_parent(struct inode *parent,
- struct inode *child);
-int ext4_inherit_context(struct inode *parent, struct inode *child);
-void ext4_to_hex(char *dst, char *src, size_t src_size);
-int ext4_process_policy(const struct ext4_encryption_policy *policy,
- struct inode *inode);
-int ext4_get_policy(struct inode *inode,
- struct ext4_encryption_policy *policy);
-
-/* crypto.c */
-extern struct kmem_cache *ext4_crypt_info_cachep;
-bool ext4_valid_contents_enc_mode(uint32_t mode);
-uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
-extern struct workqueue_struct *ext4_read_workqueue;
-struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode);
-void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
-void ext4_restore_control_page(struct page *data_page);
-struct page *ext4_encrypt(struct inode *inode,
- struct page *plaintext_page);
-int ext4_decrypt(struct page *page);
-int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
- ext4_fsblk_t pblk, ext4_lblk_t len);
-extern const struct dentry_operations ext4_encrypted_d_ops;
-
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-int ext4_init_crypto(void);
-void ext4_exit_crypto(void);
static inline int ext4_sb_has_crypto(struct super_block *sb)
{
return ext4_has_feature_encrypt(sb);
}
-#else
-static inline int ext4_init_crypto(void) { return 0; }
-static inline void ext4_exit_crypto(void) { }
-static inline int ext4_sb_has_crypto(struct super_block *sb)
+
+static inline bool ext4_encrypted_inode(struct inode *inode)
{
- return 0;
+ return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
}
-#endif
-/* crypto_fname.c */
-bool ext4_valid_filenames_enc_mode(uint32_t mode);
-u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
-unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen);
-int ext4_fname_crypto_alloc_buffer(struct inode *inode,
- u32 ilen, struct ext4_str *crypto_str);
-int _ext4_fname_disk_to_usr(struct inode *inode,
- struct dx_hash_info *hinfo,
- const struct ext4_str *iname,
- struct ext4_str *oname);
-int ext4_fname_disk_to_usr(struct inode *inode,
- struct dx_hash_info *hinfo,
- const struct ext4_dir_entry_2 *de,
- struct ext4_str *oname);
-int ext4_fname_usr_to_disk(struct inode *inode,
- const struct qstr *iname,
- struct ext4_str *oname);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
-void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str);
-int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
- int lookup, struct ext4_filename *fname);
-void ext4_fname_free_filename(struct ext4_filename *fname);
-#else
-static inline
-int ext4_setup_fname_crypto(struct inode *inode)
-{
- return 0;
-}
-static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { }
static inline int ext4_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup, struct ext4_filename *fname)
+ const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
{
- fname->usr_fname = iname;
- fname->disk_name.name = (unsigned char *) iname->name;
- fname->disk_name.len = iname->len;
- return 0;
-}
-static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
-#endif
-
+ struct fscrypt_name name;
+ int err;
-/* crypto_key.c */
-void ext4_free_crypt_info(struct ext4_crypt_info *ci);
-void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci);
-int _ext4_get_encryption_info(struct inode *inode);
+ memset(fname, 0, sizeof(struct ext4_filename));
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-int ext4_has_encryption_key(struct inode *inode);
+ err = fscrypt_setup_filename(dir, iname, lookup, &name);
-static inline int ext4_get_encryption_info(struct inode *inode)
-{
- struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
-
- if (!ci ||
- (ci->ci_keyring_key &&
- (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
- (1 << KEY_FLAG_REVOKED) |
- (1 << KEY_FLAG_DEAD)))))
- return _ext4_get_encryption_info(inode);
- return 0;
+ fname->usr_fname = name.usr_fname;
+ fname->disk_name = name.disk_name;
+ fname->hinfo.hash = name.hash;
+ fname->hinfo.minor_hash = name.minor_hash;
+ fname->crypto_buf = name.crypto_buf;
+ return err;
}
-static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
- return EXT4_I(inode)->i_crypt_info;
-}
+ struct fscrypt_name name;
-#else
-static inline int ext4_has_encryption_key(struct inode *inode)
-{
- return 0;
+ name.crypto_buf = fname->crypto_buf;
+ fscrypt_free_filename(&name);
+
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
}
-static inline int ext4_get_encryption_info(struct inode *inode)
+#else
+static inline int ext4_fname_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
{
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *) iname->name;
+ fname->disk_name.len = iname->len;
return 0;
}
-static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
-{
- return NULL;
-}
-#endif
+static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
+#define fscrypt_set_d_op(i)
+#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
+#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
+#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
+#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
+#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
+#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
+#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
+#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
+#define fscrypt_process_policy fscrypt_notsupp_process_policy
+#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
+#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
+#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
+#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
+#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
+#define fscrypt_free_filename fscrypt_notsupp_free_filename
+#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
+#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
+#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
+#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
+#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
+#endif
/* dir.c */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -2404,7 +2376,7 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent,
- struct ext4_str *ent_name);
+ struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct buffer_head *bh,
@@ -2496,8 +2468,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2556,8 +2528,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -2594,7 +2564,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
void *entry_buf,
int buf_size,
int csum_size);
-extern int ext4_empty_dir(struct inode *inode);
+extern bool ext4_empty_dir(struct inode *inode);
/* resize.c */
extern int ext4_group_add(struct super_block *sb,
@@ -3076,7 +3046,7 @@ extern int ext4_delete_inline_entry(handle_t *handle,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
int *has_inline_data);
-extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval);
@@ -3305,6 +3275,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
}
+static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
+{
+ int blksize = 1 << inode->i_blkbits;
+
+ return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
+}
+
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
deleted file mode 100644
index 1f73c29717e19..0000000000000
--- a/fs/ext4/ext4_crypto.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * linux/fs/ext4/ext4_crypto.h
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption header content for ext4
- *
- * Written by Michael Halcrow, 2015.
- */
-
-#ifndef _EXT4_CRYPTO_H
-#define _EXT4_CRYPTO_H
-
-#include <linux/fs.h>
-
-#define EXT4_KEY_DESCRIPTOR_SIZE 8
-
-/* Policy provided via an ioctl on the topmost directory */
-struct ext4_encryption_policy {
- char version;
- char contents_encryption_mode;
- char filenames_encryption_mode;
- char flags;
- char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
-} __attribute__((__packed__));
-
-#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1
-#define EXT4_KEY_DERIVATION_NONCE_SIZE 16
-
-#define EXT4_POLICY_FLAGS_PAD_4 0x00
-#define EXT4_POLICY_FLAGS_PAD_8 0x01
-#define EXT4_POLICY_FLAGS_PAD_16 0x02
-#define EXT4_POLICY_FLAGS_PAD_32 0x03
-#define EXT4_POLICY_FLAGS_PAD_MASK 0x03
-#define EXT4_POLICY_FLAGS_VALID 0x03
-
-/**
- * Encryption context for inode
- *
- * Protector format:
- * 1 byte: Protector format (1 = this version)
- * 1 byte: File contents encryption mode
- * 1 byte: File names encryption mode
- * 1 byte: Reserved
- * 8 bytes: Master Key descriptor
- * 16 bytes: Encryption Key derivation nonce
- */
-struct ext4_encryption_context {
- char format;
- char contents_encryption_mode;
- char filenames_encryption_mode;
- char flags;
- char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
- char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE];
-} __attribute__((__packed__));
-
-/* Encryption parameters */
-#define EXT4_XTS_TWEAK_SIZE 16
-#define EXT4_AES_128_ECB_KEY_SIZE 16
-#define EXT4_AES_256_GCM_KEY_SIZE 32
-#define EXT4_AES_256_CBC_KEY_SIZE 32
-#define EXT4_AES_256_CTS_KEY_SIZE 32
-#define EXT4_AES_256_XTS_KEY_SIZE 64
-#define EXT4_MAX_KEY_SIZE 64
-
-#define EXT4_KEY_DESC_PREFIX "ext4:"
-#define EXT4_KEY_DESC_PREFIX_SIZE 5
-
-/* This is passed in from userspace into the kernel keyring */
-struct ext4_encryption_key {
- __u32 mode;
- char raw[EXT4_MAX_KEY_SIZE];
- __u32 size;
-} __attribute__((__packed__));
-
-struct ext4_crypt_info {
- char ci_data_mode;
- char ci_filename_mode;
- char ci_flags;
- struct crypto_skcipher *ci_ctfm;
- struct key *ci_keyring_key;
- char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
-};
-
-#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
-#define EXT4_WRITE_PATH_FL 0x00000002
-
-struct ext4_crypto_ctx {
- union {
- struct {
- struct page *bounce_page; /* Ciphertext page */
- struct page *control_page; /* Original page */
- } w;
- struct {
- struct bio *bio;
- struct work_struct work;
- } r;
- struct list_head free_list; /* Free list */
- };
- char flags; /* Flags */
- char mode; /* Encryption mode for tfm */
-};
-
-struct ext4_completion_result {
- struct completion completion;
- int res;
-};
-
-#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \
- struct ext4_completion_result ecr = { \
- COMPLETION_INITIALIZER((ecr).completion), 0 }
-
-static inline int ext4_encryption_key_size(int mode)
-{
- switch (mode) {
- case EXT4_ENCRYPTION_MODE_AES_256_XTS:
- return EXT4_AES_256_XTS_KEY_SIZE;
- case EXT4_ENCRYPTION_MODE_AES_256_GCM:
- return EXT4_AES_256_GCM_KEY_SIZE;
- case EXT4_ENCRYPTION_MODE_AES_256_CBC:
- return EXT4_AES_256_CBC_KEY_SIZE;
- case EXT4_ENCRYPTION_MODE_AES_256_CTS:
- return EXT4_AES_256_CTS_KEY_SIZE;
- default:
- BUG();
- }
- return 0;
-}
-
-#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4
-#define EXT4_CRYPTO_BLOCK_SIZE 16
-#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32
-
-struct ext4_str {
- unsigned char *name;
- u32 len;
-};
-
-/**
- * For encrypted symlinks, the ciphertext length is stored at the beginning
- * of the string in little-endian format.
- */
-struct ext4_encrypted_symlink_data {
- __le16 len;
- char encrypted_path[1];
-} __attribute__((__packed__));
-
-/**
- * This function is used to calculate the disk space required to
- * store a filename of length l in encrypted symlink format.
- */
-static inline u32 encrypted_symlink_data_len(u32 l)
-{
- if (l < EXT4_CRYPTO_BLOCK_SIZE)
- l = EXT4_CRYPTO_BLOCK_SIZE;
- return (l + sizeof(struct ext4_encrypted_symlink_data) - 1);
-}
-
-#endif /* _EXT4_CRYPTO_H */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5f58462110953..b1d52c14098e1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -175,6 +175,13 @@ struct ext4_journal_cb_entry {
* There is no guaranteed calling order of multiple registered callbacks on
* the same transaction.
*/
+static inline void _ext4_journal_callback_add(handle_t *handle,
+ struct ext4_journal_cb_entry *jce)
+{
+ /* Add the jce to transaction's private list */
+ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+}
+
static inline void ext4_journal_callback_add(handle_t *handle,
void (*func)(struct super_block *sb,
struct ext4_journal_cb_entry *jce,
@@ -187,10 +194,11 @@ static inline void ext4_journal_callback_add(handle_t *handle,
/* Add the jce to transaction's private list */
jce->jce_func = func;
spin_lock(&sbi->s_md_lock);
- list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+ _ext4_journal_callback_add(handle, jce);
spin_unlock(&sbi->s_md_lock);
}
+
/**
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
@@ -359,10 +367,21 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return 0;
}
-static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+static inline int ext4_jbd2_inode_add_write(handle_t *handle,
+ struct inode *inode)
+{
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_inode_add_write(handle,
+ EXT4_I(inode)->jinode);
+ return 0;
+}
+
+static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
+ struct inode *inode)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+ return jbd2_journal_inode_add_wait(handle,
+ EXT4_I(inode)->jinode);
return 0;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 95bf4679ac548..d7ccb7f51dfca 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
if (!ext4_handle_valid(handle))
return 0;
- if (handle->h_buffer_credits > needed)
+ if (handle->h_buffer_credits >= needed)
return 0;
- err = ext4_journal_extend(handle, needed);
+ /*
+ * If we need to extend the journal get a few extra blocks
+ * while we're at it for efficiency's sake.
+ */
+ needed += 3;
+ err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
if (err <= 0)
return err;
err = ext4_truncate_restart_trans(handle, inode, needed);
@@ -376,9 +381,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
ext4_fsblk_t block = ext4_ext_pblock(ext);
int len = ext4_ext_get_actual_len(ext);
ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
- ext4_lblk_t last = lblock + len - 1;
- if (len == 0 || lblock > last)
+ /*
+ * We allow neither:
+ * - zero length
+ * - overflow/wrap-around
+ */
+ if (lblock + len <= lblock)
return 0;
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
@@ -469,6 +478,10 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid extent entries";
goto corrupted;
}
+ if (unlikely(depth > 32)) {
+ error_msg = "too large eh_depth";
+ goto corrupted;
+ }
/* Verify checksum on non-root extent tree nodes */
if (ext_depth(inode) != depth &&
!ext4_extent_block_csum_verify(inode, eh)) {
@@ -907,13 +920,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
eh = ext_block_hdr(bh);
ppos++;
- if (unlikely(ppos > depth)) {
- put_bh(bh);
- EXT4_ERROR_INODE(inode,
- "ppos %d > depth %d", ppos, depth);
- ret = -EFSCORRUPTED;
- goto err;
- }
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
}
@@ -2583,7 +2589,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
}
} else
ext4_error(sbi->s_sb, "strange request: removal(2) "
- "%u-%u from %u:%u\n",
+ "%u-%u from %u:%u",
from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
@@ -3738,7 +3744,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
if (ee_block != map->m_lblk || ee_len > map->m_len) {
#ifdef EXT4_DEBUG
ext4_warning("Inode (%ld) finished: extent logical block %llu,"
- " len %u; IO logical block %llu, len %u\n",
+ " len %u; IO logical block %llu, len %u",
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e38b987ac7f5f..37e059202cd2f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
(status & EXTENT_STATUS_WRITTEN)) {
ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
" delayed and written which can potentially "
- " cause data loss.\n", lblk, len);
+ " cause data loss.", lblk, len);
WARN_ON(1);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6659e216385e0..261ac3734c580 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -169,13 +169,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
- if (ret > 0) {
- ssize_t err;
-
- err = generic_write_sync(file, iocb->ki_pos - ret, ret);
- if (err < 0)
- ret = err;
- }
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
if (o_direct)
blk_finish_plug(&plug);
@@ -207,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
+ result = dax_fault(vma, vmf, ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
@@ -242,8 +237,8 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = __dax_pmd_fault(vma, addr, pmd, flags,
- ext4_dax_mmap_get_block, NULL);
+ result = dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
@@ -308,10 +303,10 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
struct inode *inode = file->f_mapping->host;
if (ext4_encrypted_inode(inode)) {
- int err = ext4_get_encryption_info(inode);
+ int err = fscrypt_get_encryption_info(inode);
if (err)
return 0;
- if (ext4_encryption_info(inode) == NULL)
+ if (!fscrypt_has_encryption_key(inode))
return -ENOKEY;
}
file_accessed(file);
@@ -329,7 +324,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct vfsmount *mnt = filp->f_path.mnt;
- struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
+ struct dentry *dir;
struct path path;
char buf[64], *cp;
int ret;
@@ -367,20 +362,24 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
}
}
if (ext4_encrypted_inode(inode)) {
- ret = ext4_get_encryption_info(inode);
+ ret = fscrypt_get_encryption_info(inode);
if (ret)
return -EACCES;
- if (ext4_encryption_info(inode) == NULL)
+ if (!fscrypt_has_encryption_key(inode))
return -ENOKEY;
}
- if (ext4_encrypted_inode(dir) &&
- !ext4_is_child_context_consistent_with_parent(dir, inode)) {
+
+ dir = dget_parent(file_dentry(filp));
+ if (ext4_encrypted_inode(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
ext4_warning(inode->i_sb,
- "Inconsistent encryption contexts: %lu/%lu\n",
- (unsigned long) dir->i_ino,
+ "Inconsistent encryption contexts: %lu/%lu",
+ (unsigned long) d_inode(dir)->i_ino,
(unsigned long) inode->i_ino);
+ dput(dir);
return -EPERM;
}
+ dput(dir);
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
@@ -428,8 +427,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
lastoff = startoff;
endoff = (loff_t)end_blk << blkbits;
- index = startoff >> PAGE_CACHE_SHIFT;
- end = endoff >> PAGE_CACHE_SHIFT;
+ index = startoff >> PAGE_SHIFT;
+ end = endoff >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
do {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae3..5c4372512ef71 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,9 +106,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
+ if (test_opt(inode->i_sb, BARRIER))
+ goto issue_flush;
goto out;
}
@@ -140,6 +142,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
needs_barrier = true;
ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
+ issue_flush:
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 237b877d316d1..9e66cd1d7b782 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -214,7 +214,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
trace_ext4_load_inode_bitmap(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(READ | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
put_bh(bh);
@@ -767,10 +767,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if ((ext4_encrypted_inode(dir) ||
DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
- err = ext4_get_encryption_info(dir);
+ err = fscrypt_get_encryption_info(dir);
if (err)
return ERR_PTR(err);
- if (ext4_encryption_info(dir) == NULL)
+ if (!fscrypt_has_encryption_key(dir))
return ERR_PTR(-EPERM);
if (!handle)
nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
@@ -1115,7 +1115,8 @@ got:
}
if (encrypt) {
- err = ext4_inherit_context(dir, inode);
+ /* give pointer to avoid set_context with journal ops. */
+ err = fscrypt_inherit_context(dir, inode, &encrypt, true);
if (err)
goto fail_free_drop;
}
@@ -1150,25 +1151,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
ext4_group_t block_group;
int bit;
- struct buffer_head *bitmap_bh;
+ struct buffer_head *bitmap_bh = NULL;
struct inode *inode = NULL;
- long err = -EIO;
+ int err = -EFSCORRUPTED;
- /* Error cases - e2fsck has already cleaned up for us */
- if (ino > max_ino) {
- ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
- err = -EFSCORRUPTED;
- goto error;
- }
+ if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+ goto bad_orphan;
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
- ino, err);
- goto error;
+ ext4_error(sb, "inode bitmap error %ld for orphan %lu",
+ ino, PTR_ERR(bitmap_bh));
+ return (struct inode *) bitmap_bh;
}
/* Having the inode bit set should be a 100% indicator that this
@@ -1179,15 +1175,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
goto bad_orphan;
inode = ext4_iget(sb, ino);
- if (IS_ERR(inode))
- goto iget_failed;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
+ ino, err);
+ return inode;
+ }
/*
- * If the orphans has i_nlinks > 0 then it should be able to be
- * truncated, otherwise it won't be removed from the orphan list
- * during processing and an infinite loop will result.
+ * If the orphans has i_nlinks > 0 then it should be able to
+ * be truncated, otherwise it won't be removed from the orphan
+ * list during processing and an infinite loop will result.
+ * Similarly, it must not be a bad inode.
*/
- if (inode->i_nlink && !ext4_can_truncate(inode))
+ if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
+ is_bad_inode(inode))
goto bad_orphan;
if (NEXT_ORPHAN(inode) > max_ino)
@@ -1195,29 +1197,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
brelse(bitmap_bh);
return inode;
-iget_failed:
- err = PTR_ERR(inode);
- inode = NULL;
bad_orphan:
- ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
- bit, (unsigned long long)bitmap_bh->b_blocknr,
- ext4_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_WARNING "inode=%p\n", inode);
+ ext4_error(sb, "bad orphan inode %lu", ino);
+ if (bitmap_bh)
+ printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ bit, (unsigned long long)bitmap_bh->b_blocknr,
+ ext4_test_bit(bit, bitmap_bh->b_data));
if (inode) {
- printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
+ printk(KERN_ERR "is_bad_inode(inode)=%d\n",
is_bad_inode(inode));
- printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
+ printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
- printk(KERN_WARNING "max_ino=%lu\n", max_ino);
- printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
+ printk(KERN_ERR "max_ino=%lu\n", max_ino);
+ printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
iput(inode);
}
brelse(bitmap_bh);
-error:
return ERR_PTR(err);
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3027fa681de53..bc15c2c176330 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -649,133 +649,6 @@ out:
}
/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- handle_t *handle;
- ssize_t ret;
- int orphan = 0;
- size_t count = iov_iter_count(iter);
- int retries = 0;
-
- if (iov_iter_rw(iter) == WRITE) {
- loff_t final_size = offset + count;
-
- if (final_size > inode->i_size) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ei->i_disksize = inode->i_size;
- ext4_journal_stop(handle);
- }
- }
-
-retry:
- if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
- /*
- * Nolock dioread optimization may be dynamically disabled
- * via ext4_inode_block_unlocked_dio(). Check inode's state
- * while holding extra i_dio_count ref.
- */
- inode_dio_begin(inode);
- smp_mb();
- if (unlikely(ext4_test_inode_state(inode,
- EXT4_STATE_DIOREAD_LOCK))) {
- inode_dio_end(inode);
- goto locked;
- }
- if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter, offset,
- ext4_dio_get_block, NULL, 0);
- else
- ret = __blockdev_direct_IO(iocb, inode,
- inode->i_sb->s_bdev, iter,
- offset, ext4_dio_get_block,
- NULL, NULL, 0);
- inode_dio_end(inode);
- } else {
-locked:
- if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter, offset,
- ext4_dio_get_block, NULL, DIO_LOCKING);
- else
- ret = blockdev_direct_IO(iocb, inode, iter, offset,
- ext4_dio_get_block);
-
- if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
- loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
-
- if (end > isize)
- ext4_truncate_failed_write(inode);
- }
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
- if (orphan) {
- int err;
-
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- /* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
-
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
- }
-out:
- return ret;
-}
-
-/*
* Calculate the number of metadata blocks need to reserve
* to allocate a new block at @lblocks for non extent file based file
*/
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 7cbdd3752ba50..f74d5ee2cdec0 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -482,7 +482,7 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page)
ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
flush_dcache_page(page);
kunmap_atomic(kaddr);
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ zero_user_segment(page, len, PAGE_SIZE);
SetPageUptodate(page);
brelse(iloc.bh);
@@ -507,7 +507,7 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
if (!page->index)
ret = ext4_read_inline_page(inode, page);
else if (!PageUptodate(page)) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
}
@@ -595,7 +595,7 @@ retry:
if (ret) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
ext4_orphan_add(handle, inode);
up_write(&EXT4_I(inode)->xattr_sem);
@@ -621,7 +621,7 @@ retry:
out:
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (sem_held)
up_write(&EXT4_I(inode)->xattr_sem);
@@ -690,7 +690,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
if (!ext4_has_inline_data(inode)) {
ret = 0;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto out_up_read;
}
@@ -815,7 +815,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
if (ret) {
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ext4_truncate_failed_write(inode);
return ret;
}
@@ -829,7 +829,7 @@ out:
up_read(&EXT4_I(inode)->xattr_sem);
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return ret;
}
@@ -919,7 +919,7 @@ retry_journal:
out_release_page:
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out_journal:
ext4_journal_stop(handle);
out:
@@ -947,7 +947,7 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
i_size_changed = 1;
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -1326,7 +1326,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
struct ext4_iloc iloc;
void *dir_buf = NULL;
struct ext4_dir_entry_2 fake;
- struct ext4_str tmp_str;
+ struct fscrypt_str tmp_str;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1739,20 +1739,20 @@ ext4_get_inline_entry(struct inode *inode,
return (struct ext4_dir_entry_2 *)(inline_pos + offset);
}
-int empty_inline_dir(struct inode *dir, int *has_inline_data)
+bool empty_inline_dir(struct inode *dir, int *has_inline_data)
{
int err, inline_size;
struct ext4_iloc iloc;
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
- int ret = 1;
+ bool ret = true;
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
err, dir->i_ino);
- return 1;
+ return true;
}
down_read(&EXT4_I(dir)->xattr_sem);
@@ -1766,7 +1766,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - no `..'",
dir->i_ino);
- ret = 1;
+ ret = true;
goto out;
}
@@ -1780,15 +1780,15 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - "
"inode %u, rec_len %u, name_len %d"
- "inline size %d\n",
+ "inline size %d",
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
- ret = 1;
+ ret = true;
goto out;
}
if (le32_to_cpu(de->inode)) {
- ret = 0;
+ ret = false;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dab84a2530ff3..3131747199e16 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -51,25 +51,31 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- __u16 csum_lo;
- __u16 csum_hi = 0;
__u32 csum;
+ __u16 dummy_csum = 0;
+ int offset = offsetof(struct ext4_inode, i_checksum_lo);
+ unsigned int csum_size = sizeof(dummy_csum);
- csum_lo = le16_to_cpu(raw->i_checksum_lo);
- raw->i_checksum_lo = 0;
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum_hi = le16_to_cpu(raw->i_checksum_hi);
- raw->i_checksum_hi = 0;
- }
-
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
- EXT4_INODE_SIZE(inode->i_sb));
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ offset += csum_size;
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ EXT4_GOOD_OLD_INODE_SIZE - offset);
- raw->i_checksum_lo = cpu_to_le16(csum_lo);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
- raw->i_checksum_hi = cpu_to_le16(csum_hi);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ offset = offsetof(struct ext4_inode, i_checksum_hi);
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw +
+ EXT4_GOOD_OLD_INODE_SIZE,
+ offset - EXT4_GOOD_OLD_INODE_SIZE);
+ if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum_size);
+ offset += csum_size;
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ EXT4_INODE_SIZE(inode->i_sb) -
+ offset);
+ }
+ }
return csum;
}
@@ -205,9 +211,9 @@ void ext4_evict_inode(struct inode *inode)
* Note that directories do not have this problem because they
* don't use page cache.
*/
- if (ext4_should_journal_data(inode) &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
- inode->i_ino != EXT4_JOURNAL_INO) {
+ if (inode->i_ino != EXT4_JOURNAL_INO &&
+ ext4_should_journal_data(inode) &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
@@ -386,7 +392,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
int ret;
if (ext4_encrypted_inode(inode))
- return ext4_encrypted_zeroout(inode, lblk, pblk, len);
+ return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
if (ret > 0)
@@ -684,6 +690,24 @@ out_sem:
ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
+
+ /*
+ * Inodes with freshly allocated blocks where contents will be
+ * visible after transaction commit must be on transaction's
+ * ordered data list.
+ */
+ if (map->m_flags & EXT4_MAP_NEW &&
+ !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
+ !(flags & EXT4_GET_BLOCKS_ZERO) &&
+ !IS_NOQUOTA(inode) &&
+ ext4_should_order_data(inode)) {
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ ret = ext4_jbd2_inode_add_wait(handle, inode);
+ else
+ ret = ext4_jbd2_inode_add_write(handle, inode);
+ if (ret)
+ return ret;
+ }
}
return retval;
}
@@ -763,39 +787,47 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
-static handle_t *start_dio_trans(struct inode *inode,
- struct buffer_head *bh_result)
+/*
+ * Get blocks function for the cases that need to start a transaction -
+ * generally different cases of direct IO and DAX IO. It also handles retries
+ * in case of ENOSPC.
+ */
+static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int flags)
{
int dio_credits;
+ handle_t *handle;
+ int retries = 0;
+ int ret;
/* Trim mapping request to maximum we can map at once for DIO */
if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
dio_credits = ext4_chunk_trans_blocks(inode,
bh_result->b_size >> inode->i_blkbits);
- return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+retry:
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = _ext4_get_block(inode, iblock, bh_result, flags);
+ ext4_journal_stop(handle);
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ return ret;
}
/* Get block function for DIO reads and writes to inodes without extents */
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
- handle_t *handle;
- int ret;
-
/* We don't expect handle for direct IO */
WARN_ON_ONCE(ext4_journal_current_handle());
- if (create) {
- handle = start_dio_trans(inode, bh);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- }
- ret = _ext4_get_block(inode, iblock, bh,
- create ? EXT4_GET_BLOCKS_CREATE : 0);
- if (create)
- ext4_journal_stop(handle);
- return ret;
+ if (!create)
+ return _ext4_get_block(inode, iblock, bh, 0);
+ return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
}
/*
@@ -806,18 +838,13 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock,
static int ext4_dio_get_block_unwritten_async(struct inode *inode,
sector_t iblock, struct buffer_head *bh_result, int create)
{
- handle_t *handle;
int ret;
/* We don't expect handle for direct IO */
WARN_ON_ONCE(ext4_journal_current_handle());
- handle = start_dio_trans(inode, bh_result);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
- ext4_journal_stop(handle);
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
/*
* When doing DIO using unwritten extents, we need io_end to convert
@@ -850,18 +877,13 @@ static int ext4_dio_get_block_unwritten_async(struct inode *inode,
static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
sector_t iblock, struct buffer_head *bh_result, int create)
{
- handle_t *handle;
int ret;
/* We don't expect handle for direct IO */
WARN_ON_ONCE(ext4_journal_current_handle());
- handle = start_dio_trans(inode, bh_result);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
- ext4_journal_stop(handle);
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
/*
* Mark inode as having pending DIO writes to unwritten extents.
@@ -965,7 +987,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return bh;
if (!bh || buffer_uptodate(bh))
return bh;
- ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
+ ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1057,7 +1079,7 @@ int do_journal_get_write_access(handle_t *handle,
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
@@ -1069,15 +1091,15 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
bool decrypt = false;
BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > PAGE_SIZE);
+ BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
bbits = ilog2(blocksize);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - bbits);
for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start = block_end, bh = bh->b_this_page) {
@@ -1119,7 +1141,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++ = bh;
decrypt = ext4_encrypted_inode(inode) &&
S_ISREG(inode->i_mode);
@@ -1136,7 +1158,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = ext4_decrypt(page);
+ err = fscrypt_decrypt_page(page);
return err;
}
#endif
@@ -1159,8 +1181,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* we allocate blocks but write fails for some reason
*/
needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
+ index = pos >> PAGE_SHIFT;
+ from = pos & (PAGE_SIZE - 1);
to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -1188,7 +1210,7 @@ retry_grab:
retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
- page_cache_release(page);
+ put_page(page);
return PTR_ERR(handle);
}
@@ -1196,7 +1218,7 @@ retry_journal:
if (page->mapping != mapping) {
/* The page got truncated from under us */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ext4_journal_stop(handle);
goto retry_grab;
}
@@ -1252,7 +1274,7 @@ retry_journal:
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_journal;
- page_cache_release(page);
+ put_page(page);
return ret;
}
*pagep = page;
@@ -1291,15 +1313,6 @@ static int ext4_write_end(struct file *file,
int i_size_changed = 0;
trace_ext4_write_end(inode, pos, len, copied);
- if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
- ret = ext4_jbd2_file_inode(handle, inode);
- if (ret) {
- unlock_page(page);
- page_cache_release(page);
- goto errout;
- }
- }
-
if (ext4_has_inline_data(inode)) {
ret = ext4_write_inline_data_end(inode, pos, len,
copied, page);
@@ -1315,7 +1328,7 @@ static int ext4_write_end(struct file *file,
*/
i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -1399,7 +1412,7 @@ static int ext4_journalled_write_end(struct file *file,
int size_changed = 0;
trace_ext4_journalled_write_end(inode, pos, len, copied);
- from = pos & (PAGE_CACHE_SIZE - 1);
+ from = pos & (PAGE_SIZE - 1);
to = from + len;
BUG_ON(!ext4_handle_valid(handle));
@@ -1423,7 +1436,7 @@ static int ext4_journalled_write_end(struct file *file,
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -1537,7 +1550,7 @@ static void ext4_da_page_release_reservation(struct page *page,
int num_clusters;
ext4_fsblk_t lblk;
- BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+ BUG_ON(stop > PAGE_SIZE || stop < length);
head = page_buffers(page);
bh = head;
@@ -1553,7 +1566,7 @@ static void ext4_da_page_release_reservation(struct page *page,
clear_buffer_delay(bh);
} else if (contiguous_blks) {
lblk = page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) -
contiguous_blks;
ext4_es_remove_extent(inode, lblk, contiguous_blks);
@@ -1563,7 +1576,7 @@ static void ext4_da_page_release_reservation(struct page *page,
} while ((bh = bh->b_this_page) != head);
if (contiguous_blks) {
- lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
ext4_es_remove_extent(inode, lblk, contiguous_blks);
}
@@ -1572,7 +1585,7 @@ static void ext4_da_page_release_reservation(struct page *page,
* need to release the reserved space for that cluster. */
num_clusters = EXT4_NUM_B2C(sbi, to_release);
while (num_clusters > 0) {
- lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+ lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
((num_clusters - 1) << sbi->s_cluster_bits);
if (sbi->s_cluster_ratio == 1 ||
!ext4_find_delalloc_cluster(inode, lblk))
@@ -1619,8 +1632,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
- start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ start = index << (PAGE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_SHIFT - inode->i_blkbits);
ext4_es_remove_extent(inode, start, last - start + 1);
}
@@ -1636,7 +1649,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
if (invalidate) {
- block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ block_invalidatepage(page, 0, PAGE_SIZE);
ClearPageUptodate(page);
}
unlock_page(page);
@@ -2007,10 +2020,10 @@ static int ext4_writepage(struct page *page,
trace_ext4_writepage(page);
size = i_size_read(inode);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
page_bufs = page_buffers(page);
/*
@@ -2034,7 +2047,7 @@ static int ext4_writepage(struct page *page,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
if ((current->flags & PF_MEMALLOC) ||
- (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
+ (inode->i_sb->s_blocksize == PAGE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
@@ -2076,10 +2089,10 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
int err;
BUG_ON(page->index != mpd->first_page);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
clear_page_dirty_for_io(page);
err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
if (!err)
@@ -2213,7 +2226,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
int nr_pages, i;
struct inode *inode = mpd->inode;
struct buffer_head *head, *bh;
- int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
sector_t pblock;
@@ -2274,7 +2287,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
* supports blocksize < pagesize as we will try to
* convert potentially unmapped parts of inode.
*/
- mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ mpd->io_submit.io_end->size += PAGE_SIZE;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
if (err < 0) {
@@ -2315,7 +2328,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* the data was copied into the page cache.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ EXT4_GET_BLOCKS_METADATA_NOFAIL |
+ EXT4_GET_BLOCKS_IO_SUBMIT;
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2426,7 +2440,7 @@ update_disksize:
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
*/
- disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
loff_t i_size;
@@ -2562,7 +2576,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
mpd->next_page = page->index + 1;
/* Add all dirty buffers to mpd */
lblk = ((ext4_lblk_t)page->index) <<
- (PAGE_CACHE_SHIFT - blkbits);
+ (PAGE_SHIFT - blkbits);
head = page_buffers(page);
err = mpage_process_page_bufs(mpd, head, head, lblk);
if (err <= 0)
@@ -2604,11 +2618,14 @@ static int ext4_writepages(struct address_space *mapping,
struct blk_plug plug;
bool give_up_on_write = false;
+ percpu_down_read(&sbi->s_journal_flag_rwsem);
trace_ext4_writepages(inode, wbc);
- if (dax_mapping(mapping))
- return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
- wbc);
+ if (dax_mapping(mapping)) {
+ ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+ wbc);
+ goto out_writepages;
+ }
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2647,7 +2664,7 @@ static int ext4_writepages(struct address_space *mapping,
* We may need to convert up to one extent per block in
* the page and we may dirty the inode.
*/
- rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
}
/*
@@ -2678,8 +2695,8 @@ static int ext4_writepages(struct address_space *mapping,
mpd.first_page = writeback_index;
mpd.last_page = -1;
} else {
- mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
- mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+ mpd.first_page = wbc->range_start >> PAGE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_SHIFT;
}
mpd.inode = inode;
@@ -2737,13 +2754,36 @@ retry:
done = true;
}
}
- ext4_journal_stop(handle);
+ /*
+ * Caution: If the handle is synchronous,
+ * ext4_journal_stop() can wait for transaction commit
+ * to finish which may depend on writeback of pages to
+ * complete or on page lock to be released. In that
+ * case, we have to wait until after we have
+ * submitted all the IO, released page locks we hold,
+ * and dropped io_end reference (for extent conversion
+ * to be able to complete) before stopping the handle.
+ */
+ if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
+ ext4_journal_stop(handle);
+ handle = NULL;
+ }
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
/* Unlock pages we didn't use */
mpage_release_unused_pages(&mpd, give_up_on_write);
- /* Drop our io_end reference we got from init */
- ext4_put_io_end(mpd.io_submit.io_end);
+ /*
+ * Drop our io_end reference we got from init. We have
+ * to be careful and use deferred io_end finishing if
+ * we are still holding the transaction as we can
+ * release the last reference to io_end which may end
+ * up doing unwritten extent conversion.
+ */
+ if (handle) {
+ ext4_put_io_end_defer(mpd.io_submit.io_end);
+ ext4_journal_stop(handle);
+ } else
+ ext4_put_io_end(mpd.io_submit.io_end);
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2778,6 +2818,7 @@ retry:
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
+ percpu_up_read(&sbi->s_journal_flag_rwsem);
return ret;
}
@@ -2838,7 +2879,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
handle_t *handle;
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
if (ext4_nonda_switch(inode->i_sb)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -2881,7 +2922,7 @@ retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
ext4_da_write_credits(inode, pos, len));
if (IS_ERR(handle)) {
- page_cache_release(page);
+ put_page(page);
return PTR_ERR(handle);
}
@@ -2889,7 +2930,7 @@ retry_journal:
if (page->mapping != mapping) {
/* The page got truncated from under us */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ext4_journal_stop(handle);
goto retry_grab;
}
@@ -2917,7 +2958,7 @@ retry_journal:
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_journal;
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -2965,7 +3006,7 @@ static int ext4_da_write_end(struct file *file,
len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
- start = pos & (PAGE_CACHE_SIZE - 1);
+ start = pos & (PAGE_SIZE - 1);
end = start + copied - 1;
/*
@@ -3187,7 +3228,7 @@ static int __ext4_journalled_invalidatepage(struct page *page,
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0 && length == PAGE_CACHE_SIZE)
+ if (offset == 0 && length == PAGE_SIZE)
ClearPageChecked(page);
return jbd2_journal_invalidatepage(journal, page, offset, length);
@@ -3217,75 +3258,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
#ifdef CONFIG_FS_DAX
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+/*
+ * Get block function for DAX IO and mmap faults. It takes care of converting
+ * unwritten extents to written ones and initializes new / converted blocks
+ * to zeros.
+ */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
- int ret, err;
- int credits;
- struct ext4_map_blocks map;
- handle_t *handle = NULL;
- int flags = 0;
-
- ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
- inode->i_ino, create);
- map.m_lblk = iblock;
- map.m_len = bh_result->b_size >> inode->i_blkbits;
- credits = ext4_chunk_trans_blocks(inode, map.m_len);
- if (create) {
- flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
- }
- }
+ int ret;
- ret = ext4_map_blocks(handle, inode, &map, flags);
- if (create) {
- err = ext4_journal_stop(handle);
- if (ret >= 0 && err < 0)
- ret = err;
- }
- if (ret <= 0)
- goto out;
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int err2;
+ ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
+ if (!create)
+ return _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * We are protected by i_mmap_sem so we know block cannot go
- * away from under us even though we dropped i_data_sem.
- * Convert extent to written and write zeros there.
- *
- * Note: We may get here even when create == 0.
- */
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (ret < 0)
+ return ret;
- err = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
- if (err < 0)
- ret = err;
- err2 = ext4_journal_stop(handle);
- if (err2 < 0 && ret > 0)
- ret = err2;
- }
-out:
- WARN_ON_ONCE(ret == 0 && create);
- if (ret > 0) {
- map_bh(bh_result, inode->i_sb, map.m_pblk);
+ if (buffer_unwritten(bh_result)) {
/*
- * At least for now we have to clear BH_New so that DAX code
- * doesn't attempt to zero blocks again in a racy way.
+ * We are protected by i_mmap_sem or i_mutex so we know block
+ * cannot go away from under us even though we dropped
+ * i_data_sem. Convert extent to written and write zeros there.
*/
- map.m_flags &= ~EXT4_MAP_NEW;
- ext4_update_bh_state(bh_result, map.m_flags);
- bh_result->b_size = map.m_len << inode->i_blkbits;
- ret = 0;
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_CONVERT |
+ EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (ret < 0)
+ return ret;
}
- return ret;
+ /*
+ * At least for now we have to clear BH_New so that DAX code
+ * doesn't attempt to zero blocks again in a racy way.
+ */
+ clear_buffer_new(bh_result);
+ return 0;
+}
+#else
+/* Just define empty function, it will never get called. */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ BUG();
+ return 0;
}
#endif
@@ -3318,7 +3336,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
}
/*
- * For ext4 extent files, ext4 will do direct-io write to holes,
+ * Handling of direct IO writes.
+ *
+ * For ext4 extent files, ext4 will do direct-io write even to holes,
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
@@ -3336,21 +3356,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
* if the machine crashes during the write.
*
*/
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ struct ext4_inode_info *ei = EXT4_I(inode);
ssize_t ret;
+ loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(iter);
int overwrite = 0;
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
+ int orphan = 0;
+ handle_t *handle;
- /* Use the old path for reads and writes beyond i_size. */
- if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
- return ext4_ind_direct_IO(iocb, iter, offset);
+ if (final_size > inode->i_size) {
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ orphan = 1;
+ ei->i_disksize = inode->i_size;
+ ext4_journal_stop(handle);
+ }
BUG_ON(iocb->private == NULL);
@@ -3359,8 +3395,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* conversion. This also disallows race between truncate() and
* overwrite DIO as i_dio_count needs to be incremented under i_mutex.
*/
- if (iov_iter_rw(iter) == WRITE)
- inode_dio_begin(inode);
+ inode_dio_begin(inode);
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
@@ -3369,7 +3404,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
inode_unlock(inode);
/*
- * We could direct write to holes and fallocate.
+ * For extent mapped files we could direct write to holes and fallocate.
*
* Allocated blocks to fill the hole are marked as unwritten to prevent
* parallel buffered read to expose the stale data before DIO complete
@@ -3391,7 +3426,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
iocb->private = NULL;
if (overwrite)
get_block_func = ext4_dio_get_block_overwrite;
- else if (is_sync_kiocb(iocb)) {
+ else if (IS_DAX(inode)) {
+ /*
+ * We can avoid zeroing for aligned DAX writes beyond EOF. Other
+ * writes need zeroing either because they can race with page
+ * faults or because they use partial blocks.
+ */
+ if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
+ ext4_aligned_io(inode, offset, count))
+ get_block_func = ext4_dio_get_block;
+ else
+ get_block_func = ext4_dax_get_block;
+ dio_flags = DIO_LOCKING;
+ } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+ get_block_func = ext4_dio_get_block;
+ dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ } else if (is_sync_kiocb(iocb)) {
get_block_func = ext4_dio_get_block_unwritten_sync;
dio_flags = DIO_LOCKING;
} else {
@@ -3401,12 +3452,12 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
#ifdef CONFIG_EXT4_FS_ENCRYPTION
BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
#endif
- if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
+ if (IS_DAX(inode)) {
+ ret = dax_do_io(iocb, inode, iter, get_block_func,
ext4_end_io_dio, dio_flags);
- else
+ } else
ret = __blockdev_direct_IO(iocb, inode,
- inode->i_sb->s_bdev, iter, offset,
+ inode->i_sb->s_bdev, iter,
get_block_func,
ext4_end_io_dio, NULL, dio_flags);
@@ -3424,21 +3475,95 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
- if (iov_iter_rw(iter) == WRITE)
- inode_dio_end(inode);
+ inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite)
inode_lock(inode);
+ if (ret < 0 && final_size > inode->i_size)
+ ext4_truncate_failed_write(inode);
+
+ /* Handle extending of i_size after direct IO write */
+ if (orphan) {
+ int err;
+
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ /* This is really bad luck. We've written the data
+ * but cannot extend i_size. Bail out and pretend
+ * the write failed... */
+ ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
+ goto out;
+ }
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ if (ret > 0) {
+ loff_t end = offset + ret;
+ if (end > inode->i_size) {
+ ei->i_disksize = end;
+ i_size_write(inode, end);
+ /*
+ * We're going to return a positive `ret'
+ * here due to non-zero-length I/O, so there's
+ * no way of reporting error returns from
+ * ext4_mark_inode_dirty() to userspace. So
+ * ignore it.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ err = ext4_journal_stop(handle);
+ if (ret == 0)
+ ret = err;
+ }
+out:
return ret;
}
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int unlocked = 0;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ ssize_t ret;
+
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ inode_dio_begin(inode);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK)))
+ inode_dio_end(inode);
+ else
+ unlocked = 1;
+ }
+ if (IS_DAX(inode)) {
+ ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
+ NULL, unlocked ? 0 : DIO_LOCKING);
+ } else {
+ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, ext4_dio_get_block,
+ NULL, NULL,
+ unlocked ? 0 : DIO_LOCKING);
+ }
+ if (unlocked)
+ inode_dio_end(inode);
+ return ret;
+}
+
+static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
size_t count = iov_iter_count(iter);
+ loff_t offset = iocb->ki_pos;
ssize_t ret;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3457,10 +3582,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return 0;
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_direct_IO(iocb, iter, offset);
+ if (iov_iter_rw(iter) == READ)
+ ret = ext4_direct_IO_read(iocb, iter);
else
- ret = ext4_ind_direct_IO(iocb, iter, offset);
+ ret = ext4_direct_IO_write(iocb, iter);
trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
return ret;
}
@@ -3536,10 +3661,7 @@ void ext4_set_aops(struct inode *inode)
{
switch (ext4_inode_journal_mode(inode)) {
case EXT4_INODE_ORDERED_DATA_MODE:
- ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
- break;
case EXT4_INODE_WRITEBACK_DATA_MODE:
- ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3556,8 +3678,8 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ ext4_fsblk_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
@@ -3565,14 +3687,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
struct page *page;
int err = 0;
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ page = find_or_create_page(mapping, from >> PAGE_SHIFT,
mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page)
return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -3605,7 +3727,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
if (!buffer_uptodate(bh)) {
err = -EIO;
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
@@ -3613,9 +3735,9 @@ static int __ext4_block_zero_page_range(handle_t *handle,
if (S_ISREG(inode->i_mode) &&
ext4_encrypted_inode(inode)) {
/* We expect the key to be set. */
- BUG_ON(!ext4_has_encryption_key(inode));
- BUG_ON(blocksize != PAGE_CACHE_SIZE);
- WARN_ON_ONCE(ext4_decrypt(page));
+ BUG_ON(!fscrypt_has_encryption_key(inode));
+ BUG_ON(blocksize != PAGE_SIZE);
+ WARN_ON_ONCE(fscrypt_decrypt_page(page));
}
}
if (ext4_should_journal_data(inode)) {
@@ -3632,13 +3754,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
} else {
err = 0;
mark_buffer_dirty(bh);
- if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
- err = ext4_jbd2_file_inode(handle, inode);
+ if (ext4_should_order_data(inode))
+ err = ext4_jbd2_inode_add_write(handle, inode);
}
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -3653,7 +3775,7 @@ static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
struct inode *inode = mapping->host;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned max = blocksize - (offset & (blocksize - 1));
@@ -3678,7 +3800,7 @@ static int ext4_block_zero_page_range(handle_t *handle,
static int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned length;
unsigned blocksize;
struct inode *inode = mapping->host;
@@ -3816,7 +3938,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
*/
if (offset + length > inode->i_size) {
length = inode->i_size +
- PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
offset;
}
@@ -4188,7 +4310,7 @@ make_io:
trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE_BLOCK(inode, block,
@@ -4891,23 +5013,23 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
tid_t commit_tid = 0;
int ret;
- offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ offset = inode->i_size & (PAGE_SIZE - 1);
/*
* All buffers in the last page remain valid? Then there's nothing to
- * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * do. We do the check mainly to optimize the common PAGE_SIZE ==
* blocksize case
*/
- if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ if (offset > PAGE_SIZE - (1 << inode->i_blkbits))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
+ inode->i_size >> PAGE_SHIFT);
if (!page)
return;
ret = __ext4_journalled_invalidatepage(page, offset,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (ret != -EBUSY)
return;
commit_tid = 0;
@@ -5431,6 +5553,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
journal_t *journal;
handle_t *handle;
int err;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
/*
* We have to be very careful here: changing a data block's
@@ -5447,22 +5570,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return 0;
if (is_journal_aborted(journal))
return -EROFS;
- /* We have to allocate physical blocks for delalloc blocks
- * before flushing journal. otherwise delalloc blocks can not
- * be allocated any more. even more truncate on delalloc blocks
- * could trigger BUG by flushing delalloc blocks in journal.
- * There is no delalloc block in non-journal data mode.
- */
- if (val && test_opt(inode->i_sb, DELALLOC)) {
- err = ext4_alloc_da_blocks(inode);
- if (err < 0)
- return err;
- }
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Before flushing the journal and switching inode's aops, we have
+ * to flush all dirty data the inode has. There can be outstanding
+ * delayed allocations, there can be unwritten extents created by
+ * fallocate or buffered writes in dioread_nolock mode covered by
+ * dirty data which can be converted only after flushing the dirty
+ * data (and journalled aops don't know how to handle these cases).
+ */
+ if (val) {
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err < 0) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ ext4_inode_resume_unlocked_dio(inode);
+ return err;
+ }
+ }
+
+ percpu_down_write(&sbi->s_journal_flag_rwsem);
jbd2_journal_lock_updates(journal);
/*
@@ -5479,6 +5610,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = jbd2_journal_flush(journal);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
+ percpu_up_write(&sbi->s_journal_flag_rwsem);
ext4_inode_resume_unlocked_dio(inode);
return err;
}
@@ -5487,6 +5619,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ percpu_up_write(&sbi->s_journal_flag_rwsem);
+
+ if (val)
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
@@ -5546,10 +5682,10 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
/*
* Return if we have all the buffers mapped. This avoids the need to do
* journal_start/journal_stop which can block and take a long time
@@ -5580,7 +5716,7 @@ retry_alloc:
ret = block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ PAGE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
ext4_journal_stop(handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eae5917c534e5..10686fd67fb42 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -13,8 +13,8 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/file.h>
-#include <linux/random.h>
#include <linux/quotaops.h>
+#include <linux/uuid.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -308,6 +308,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
kprojid_t kprojid;
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
+ struct dquot *transfer_to[MAXQUOTAS] = { };
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_PROJECT)) {
@@ -361,17 +362,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
if (err)
goto out_stop;
- if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
- struct dquot *transfer_to[MAXQUOTAS] = { };
-
- transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
- if (transfer_to[PRJQUOTA]) {
- err = __dquot_transfer(inode, transfer_to);
- dqput(transfer_to[PRJQUOTA]);
- if (err)
- goto out_dirty;
- }
+ transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+ if (!IS_ERR(transfer_to[PRJQUOTA])) {
+ err = __dquot_transfer(inode, transfer_to);
+ dqput(transfer_to[PRJQUOTA]);
+ if (err)
+ goto out_dirty;
}
+
EXT4_I(inode)->i_projid = kprojid;
inode->i_ctime = ext4_current_time(inode);
out_dirty:
@@ -772,19 +770,13 @@ resizefs_out:
return ext4_ext_precache(inode);
case EXT4_IOC_SET_ENCRYPTION_POLICY: {
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct ext4_encryption_policy policy;
- int err = 0;
+ struct fscrypt_policy policy;
if (copy_from_user(&policy,
- (struct ext4_encryption_policy __user *)arg,
- sizeof(policy))) {
- err = -EFAULT;
- goto encryption_policy_out;
- }
-
- err = ext4_process_policy(&policy, inode);
-encryption_policy_out:
- return err;
+ (struct fscrypt_policy __user *)arg,
+ sizeof(policy)))
+ return -EFAULT;
+ return fscrypt_process_policy(inode, &policy);
#else
return -EOPNOTSUPP;
#endif
@@ -827,12 +819,12 @@ encryption_policy_out:
}
case EXT4_IOC_GET_ENCRYPTION_POLICY: {
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct ext4_encryption_policy policy;
+ struct fscrypt_policy policy;
int err = 0;
if (!ext4_encrypted_inode(inode))
return -ENOENT;
- err = ext4_get_policy(inode, &policy);
+ err = fscrypt_get_policy(inode, &policy);
if (err)
return err;
if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 50e05df28f665..f418f55c2bbe1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -119,7 +119,7 @@ MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
*
*
* one block each for bitmap and buddy information. So for each group we
- * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
* blocksize) blocks. So it can have information regarding groups_per_page
* which is blocks_per_page/2
*
@@ -807,7 +807,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
*
* one block each for bitmap and buddy information.
* So for each group we take up 2 blocks. A page can
- * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
+ * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
* So it can have information regarding groups_per_page which
* is blocks_per_page/2
*
@@ -839,7 +839,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = 1 << inode->i_blkbits;
- blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+ blocks_per_page = PAGE_SIZE / blocksize;
groups_per_page = blocks_per_page >> 1;
if (groups_per_page == 0)
@@ -993,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ blocks_per_page = PAGE_SIZE / sb->s_blocksize;
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1028,11 +1028,11 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page) {
unlock_page(e4b->bd_bitmap_page);
- page_cache_release(e4b->bd_bitmap_page);
+ put_page(e4b->bd_bitmap_page);
}
if (e4b->bd_buddy_page) {
unlock_page(e4b->bd_buddy_page);
- page_cache_release(e4b->bd_buddy_page);
+ put_page(e4b->bd_buddy_page);
}
}
@@ -1125,7 +1125,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
might_sleep();
mb_debug(1, "load group %u\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
e4b->bd_blkbits = sb->s_blocksize_bits;
@@ -1167,7 +1167,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
* is yet to initialize the same. So
* wait for it to initialize.
*/
- page_cache_release(page);
+ put_page(page);
page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
@@ -1203,7 +1203,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
- page_cache_release(page);
+ put_page(page);
page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
@@ -1238,11 +1238,11 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
err:
if (page)
- page_cache_release(page);
+ put_page(page);
if (e4b->bd_bitmap_page)
- page_cache_release(e4b->bd_bitmap_page);
+ put_page(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
- page_cache_release(e4b->bd_buddy_page);
+ put_page(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
return ret;
@@ -1257,15 +1257,16 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
- page_cache_release(e4b->bd_bitmap_page);
+ put_page(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
- page_cache_release(e4b->bd_buddy_page);
+ put_page(e4b->bd_buddy_page);
}
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
int order = 1;
+ int bb_incr = 1 << (e4b->bd_blkbits - 1);
void *bb;
BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
@@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
/* this block is part of buddy of order 'order' */
return order;
}
- bb += 1 << (e4b->bd_blkbits - order);
+ bb += bb_incr;
+ bb_incr >>= 1;
order++;
}
return 0;
@@ -2348,7 +2350,6 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
const struct file_operations ext4_seq_mb_groups_fops = {
- .owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -2583,7 +2584,7 @@ int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
- unsigned offset;
+ unsigned offset, offset_incr;
unsigned max;
int ret;
@@ -2612,17 +2613,20 @@ int ext4_mb_init(struct super_block *sb)
i = 1;
offset = 0;
+ offset_incr = 1 << (sb->s_blocksize_bits - 1);
max = sb->s_blocksize << 2;
do {
sbi->s_mb_offsets[i] = offset;
sbi->s_mb_maxs[i] = max;
- offset += 1 << (sb->s_blocksize_bits - i);
+ offset += offset_incr;
+ offset_incr = offset_incr >> 1;
max = max >> 1;
i++;
} while (i <= sb->s_blocksize_bits + 1);
spin_lock_init(&sbi->s_md_lock);
spin_lock_init(&sbi->s_bal_lock);
+ sbi->s_mb_free_pending = 0;
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -2810,6 +2814,9 @@ static void ext4_free_data_callback(struct super_block *sb,
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
+ spin_lock(&EXT4_SB(sb)->s_md_lock);
+ EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
+ spin_unlock(&EXT4_SB(sb)->s_md_lock);
db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
@@ -2833,8 +2840,8 @@ static void ext4_free_data_callback(struct super_block *sb,
/* No more items in the per group rb tree
* balance refcounts from ext4_mb_free_metadata()
*/
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
+ put_page(e4b.bd_buddy_page);
+ put_page(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->efd_group);
kmem_cache_free(ext4_free_data_cachep, entry);
@@ -2935,7 +2942,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata", block, block+len);
/* File system mounted not to panic on error
- * Fix the bitmap and repeat the block allocation
+ * Fix the bitmap and return EFSCORRUPTED
* We leak some of the blocks here.
*/
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
@@ -2944,7 +2951,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!err)
- err = -EAGAIN;
+ err = -EFSCORRUPTED;
goto out_err;
}
@@ -4385,9 +4392,9 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
- page_cache_release(ac->ac_bitmap_page);
+ put_page(ac->ac_bitmap_page);
if (ac->ac_buddy_page)
- page_cache_release(ac->ac_buddy_page);
+ put_page(ac->ac_buddy_page);
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
@@ -4509,18 +4516,7 @@ repeat:
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
- if (*errp == -EAGAIN) {
- /*
- * drop the reference that we took
- * in ext4_mb_use_best_found
- */
- ext4_mb_release_context(ac);
- ac->ac_b_ex.fe_group = 0;
- ac->ac_b_ex.fe_start = 0;
- ac->ac_b_ex.fe_len = 0;
- ac->ac_status = AC_STATUS_CONTINUE;
- goto repeat;
- } else if (*errp) {
+ if (*errp) {
ext4_discard_allocated_blocks(ac);
goto errout;
} else {
@@ -4579,6 +4575,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
{
ext4_group_t group = e4b->bd_group;
ext4_grpblk_t cluster;
+ ext4_grpblk_t clusters = new_entry->efd_count;
struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
@@ -4599,8 +4596,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* otherwise we'll refresh it from
* on-disk bitmap and lose not-yet-available
* blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
+ get_page(e4b->bd_buddy_page);
+ get_page(e4b->bd_bitmap_page);
}
while (*n) {
parent = *n;
@@ -4645,8 +4642,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
}
/* Add the extent to transaction's private list */
- ext4_journal_callback_add(handle, ext4_free_data_callback,
- &new_entry->efd_jce);
+ new_entry->efd_jce.jce_func = ext4_free_data_callback;
+ spin_lock(&sbi->s_md_lock);
+ _ext4_journal_callback_add(handle, &new_entry->efd_jce);
+ sbi->s_mb_free_pending += clusters;
+ spin_unlock(&sbi->s_md_lock);
return 0;
}
@@ -4935,7 +4935,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* boundary.
*/
if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- ext4_warning(sb, "too much blocks added to group %u\n",
+ ext4_warning(sb, "too much blocks added to group %u",
block_group);
err = -EINVAL;
goto error_return;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 24445275d330e..d89754ef1aab7 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
@@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
+ submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
ret = -EIO;
@@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
__ext4_warning(sb, function, line, "%s", msg);
__ext4_warning(sb, function, line,
"MMP failure info: last update time: %llu, last update "
- "node: %s, last update device: %s\n",
+ "node: %s, last update device: %s",
(long long unsigned int) le64_to_cpu(mmp->mmp_time),
mmp->mmp_nodename, mmp->mmp_bdevname);
}
@@ -353,7 +353,7 @@ skip:
* wait for MMP interval and check mmp_seq.
*/
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
- ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ ext4_warning(sb, "MMP startup interrupted, failing mount");
goto failed;
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4098acc701c3e..a920c5d29fac0 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -60,10 +60,10 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
} else {
down_write(&EXT4_I(second)->i_data_sem);
- down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
}
}
@@ -156,7 +156,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
if (!page[1]) {
unlock_page(page[0]);
- page_cache_release(page[0]);
+ put_page(page[0]);
return -ENOMEM;
}
/*
@@ -192,7 +192,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits);
for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start = block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
@@ -268,7 +268,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
int i, err2, jblocks, retries = 0;
int replaced_count = 0;
int from = data_offset_in_page << orig_inode->i_blkbits;
- int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
struct buffer_head *bh = NULL;
@@ -400,13 +400,13 @@ data_copy:
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
- *err = ext4_jbd2_file_inode(handle, orig_inode);
+ *err = ext4_jbd2_inode_add_write(handle, orig_inode);
unlock_pages:
unlock_page(pagep[0]);
- page_cache_release(pagep[0]);
+ put_page(pagep[0]);
unlock_page(pagep[1]);
- page_cache_release(pagep[1]);
+ put_page(pagep[1]);
stop_journal:
ext4_journal_stop(handle);
if (*err == -ENOSPC &&
@@ -484,6 +484,13 @@ mext_check_arguments(struct inode *orig_inode,
return -EBUSY;
}
+ if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+ "not be quota files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EBUSY;
+ }
+
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -554,7 +561,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
struct inode *orig_inode = file_inode(o_filp);
struct inode *donor_inode = file_inode(d_filp);
struct ext4_ext_path *path = NULL;
- int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
ext4_lblk_t o_end, o_start = orig_blk;
ext4_lblk_t d_start = donor_blk;
int ret;
@@ -648,9 +655,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
if (o_end - o_start < cur_len)
cur_len = o_end - o_start;
- orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+ orig_page_index = o_start >> (PAGE_SHIFT -
orig_inode->i_blkbits);
- donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+ donor_page_index = d_start >> (PAGE_SHIFT -
donor_inode->i_blkbits);
offset_in_page = o_start % blocks_per_page;
if (cur_len > blocks_per_page- offset_in_page)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 48e4b8907826e..34c0142caf6a1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -420,15 +420,14 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
- __le32 save_csum;
int size;
+ __u32 dummy_csum = 0;
+ int offset = offsetof(struct dx_tail, dt_checksum);
size = count_offset + (count * sizeof(struct dx_entry));
- save_csum = t->dt_checksum;
- t->dt_checksum = 0;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
- t->dt_checksum = save_csum;
+ csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
return cpu_to_le32(csum);
}
@@ -446,14 +445,14 @@ static int ext4_dx_csum_verify(struct inode *inode,
c = get_dx_countlimit(inode, dirent, &count_offset);
if (!c) {
EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
- return 1;
+ return 0;
}
limit = le16_to_cpu(c->limit);
count = le16_to_cpu(c->count);
if (count_offset + (limit * sizeof(struct dx_entry)) >
EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
warn_no_space_for_csum(inode);
- return 1;
+ return 0;
}
t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -612,19 +611,19 @@ static struct stats dx_show_leaf(struct inode *dir,
#ifdef CONFIG_EXT4_FS_ENCRYPTION
int len;
char *name;
- struct ext4_str fname_crypto_str
- = {.name = NULL, .len = 0};
+ struct fscrypt_str fname_crypto_str =
+ FSTR_INIT(NULL, 0);
int res = 0;
name = de->name;
len = de->name_len;
- if (ext4_encrypted_inode(inode))
- res = ext4_get_encryption_info(dir);
+ if (ext4_encrypted_inode(dir))
+ res = fscrypt_get_encryption_info(dir);
if (res) {
printk(KERN_WARNING "Error setting up"
" fname crypto: %d\n", res);
}
- if (ctx == NULL) {
+ if (!fscrypt_has_encryption_key(dir)) {
/* Directory is not encrypted */
ext4fs_dirhash(de->name,
de->name_len, &h);
@@ -633,19 +632,21 @@ static struct stats dx_show_leaf(struct inode *dir,
(unsigned) ((char *) de
- base));
} else {
+ struct fscrypt_str de_name =
+ FSTR_INIT(name, len);
+
/* Directory is encrypted */
- res = ext4_fname_crypto_alloc_buffer(
- ctx, de->name_len,
+ res = fscrypt_fname_alloc_buffer(
+ dir, len,
&fname_crypto_str);
- if (res < 0) {
+ if (res < 0)
printk(KERN_WARNING "Error "
"allocating crypto "
"buffer--skipping "
"crypto\n");
- ctx = NULL;
- }
- res = ext4_fname_disk_to_usr(ctx, NULL, de,
- &fname_crypto_str);
+ res = fscrypt_fname_disk_to_usr(dir,
+ 0, 0, &de_name,
+ &fname_crypto_str);
if (res < 0) {
printk(KERN_WARNING "Error "
"converting filename "
@@ -662,8 +663,8 @@ static struct stats dx_show_leaf(struct inode *dir,
printk("%*.s:(E)%x.%u ", len, name,
h.hash, (unsigned) ((char *) de
- base));
- ext4_fname_crypto_free_buffer(
- &fname_crypto_str);
+ fscrypt_fname_free_buffer(
+ &fname_crypto_str);
}
#else
int len = de->name_len;
@@ -952,7 +953,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
- struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str;
+ struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -967,12 +968,12 @@ static int htree_dirblock_to_tree(struct file *dir_file,
#ifdef CONFIG_EXT4_FS_ENCRYPTION
/* Check if the directory is encrypted */
if (ext4_encrypted_inode(dir)) {
- err = ext4_get_encryption_info(dir);
+ err = fscrypt_get_encryption_info(dir);
if (err < 0) {
brelse(bh);
return err;
}
- err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN,
+ err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN,
&fname_crypto_str);
if (err < 0) {
brelse(bh);
@@ -1003,10 +1004,13 @@ static int htree_dirblock_to_tree(struct file *dir_file,
&tmp_str);
} else {
int save_len = fname_crypto_str.len;
+ struct fscrypt_str de_name = FSTR_INIT(de->name,
+ de->name_len);
/* Directory is encrypted */
- err = ext4_fname_disk_to_usr(dir, hinfo, de,
- &fname_crypto_str);
+ err = fscrypt_fname_disk_to_usr(dir, hinfo->hash,
+ hinfo->minor_hash, &de_name,
+ &fname_crypto_str);
if (err < 0) {
count = err;
goto errout;
@@ -1025,7 +1029,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
errout:
brelse(bh);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- ext4_fname_crypto_free_buffer(&fname_crypto_str);
+ fscrypt_fname_free_buffer(&fname_crypto_str);
#endif
return count;
}
@@ -1050,7 +1054,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
int count = 0;
int ret, err;
__u32 hashval;
- struct ext4_str tmp_str;
+ struct fscrypt_str tmp_str;
dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
@@ -1107,6 +1111,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
while (1) {
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto errout;
+ }
+ cond_resched();
block = dx_get_block(frame->at);
ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
start_hash, start_minor_hash);
@@ -1438,7 +1447,8 @@ restart:
}
bh_use[ra_max] = bh;
if (bh)
- ll_rw_block(READ | REQ_META | REQ_PRIO,
+ ll_rw_block(REQ_OP_READ,
+ REQ_META | REQ_PRIO,
1, &bh);
}
}
@@ -1558,26 +1568,23 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
- if (ext4_encrypted_inode(dir)) {
- int res = ext4_get_encryption_info(dir);
+ if (ext4_encrypted_inode(dir)) {
+ int res = fscrypt_get_encryption_info(dir);
/*
- * This should be a properly defined flag for
- * dentry->d_flags when we uplift this to the VFS.
- * d_fsdata is set to (void *) 1 if if the dentry is
+ * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
* created while the directory was encrypted and we
- * don't have access to the key.
+ * have access to the key.
*/
- dentry->d_fsdata = NULL;
- if (ext4_encryption_info(dir))
- dentry->d_fsdata = (void *) 1;
- d_set_d_op(dentry, &ext4_encrypted_d_ops);
- if (res && res != -ENOKEY)
- return ERR_PTR(res);
- }
+ if (fscrypt_has_encryption_key(dir))
+ fscrypt_set_encrypted_dentry(dentry);
+ fscrypt_set_d_op(dentry);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
- if (dentry->d_name.len > EXT4_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
+ if (dentry->d_name.len > EXT4_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
@@ -1604,16 +1611,14 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
- !ext4_is_child_context_consistent_with_parent(dir,
- inode)) {
+ !fscrypt_has_permitted_context(dir, inode)) {
int nokey = ext4_encrypted_inode(inode) &&
- !ext4_encryption_info(inode);
-
+ !fscrypt_has_encryption_key(inode);
iput(inode);
if (nokey)
return ERR_PTR(-ENOKEY);
ext4_warning(inode->i_sb,
- "Inconsistent encryption contexts: %lu/%lu\n",
+ "Inconsistent encryption contexts: %lu/%lu",
(unsigned long) dir->i_ino,
(unsigned long) inode->i_ino);
return ERR_PTR(-EPERM);
@@ -1638,13 +1643,13 @@ struct dentry *ext4_get_parent(struct dentry *child)
ino = le32_to_cpu(de->inode);
brelse(bh);
- if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
+ if (!ext4_valid_inum(child->d_sb, ino)) {
EXT4_ERROR_INODE(d_inode(child),
"bad parent inode number: %u", ino);
return ERR_PTR(-EFSCORRUPTED);
}
- return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
+ return d_obtain_alias(ext4_iget_normal(child->d_sb, ino));
}
/*
@@ -2685,30 +2690,30 @@ out_stop:
/*
* routine to check that the specified directory is empty (for rmdir)
*/
-int ext4_empty_dir(struct inode *inode)
+bool ext4_empty_dir(struct inode *inode)
{
unsigned int offset;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *de1;
struct super_block *sb;
- int err = 0;
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
+ int ret;
- err = empty_inline_dir(inode, &has_inline_data);
+ ret = empty_inline_dir(inode, &has_inline_data);
if (has_inline_data)
- return err;
+ return ret;
}
sb = inode->i_sb;
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
EXT4_ERROR_INODE(inode, "invalid size");
- return 1;
+ return true;
}
bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh))
- return 1;
+ return true;
de = (struct ext4_dir_entry_2 *) bh->b_data;
de1 = ext4_next_entry(de, sb->s_blocksize);
@@ -2717,7 +2722,7 @@ int ext4_empty_dir(struct inode *inode)
strcmp(".", de->name) || strcmp("..", de1->name)) {
ext4_warning_inode(inode, "directory missing '.' and/or '..'");
brelse(bh);
- return 1;
+ return true;
}
offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
@@ -2725,12 +2730,11 @@ int ext4_empty_dir(struct inode *inode)
while (offset < inode->i_size) {
if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
unsigned int lblock;
- err = 0;
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
bh = ext4_read_dirblock(inode, lblock, EITHER);
if (IS_ERR(bh))
- return 1;
+ return true;
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
if (ext4_check_dir_entry(inode, NULL, de, bh,
@@ -2742,13 +2746,13 @@ int ext4_empty_dir(struct inode *inode)
}
if (le32_to_cpu(de->inode)) {
brelse(bh);
- return 0;
+ return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
de = ext4_next_entry(de, sb->s_blocksize);
}
brelse(bh);
- return 1;
+ return true;
}
/*
@@ -2828,7 +2832,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
* list entries can cause panics at unmount time.
*/
mutex_lock(&sbi->s_orphan_lock);
- list_del(&EXT4_I(inode)->i_orphan);
+ list_del_init(&EXT4_I(inode)->i_orphan);
mutex_unlock(&sbi->s_orphan_lock);
}
}
@@ -3071,8 +3075,8 @@ static int ext4_symlink(struct inode *dir,
int err, len = strlen(symname);
int credits;
bool encryption_required;
- struct ext4_str disk_link;
- struct ext4_encrypted_symlink_data *sd = NULL;
+ struct fscrypt_str disk_link;
+ struct fscrypt_symlink_data *sd = NULL;
disk_link.len = len + 1;
disk_link.name = (char *) symname;
@@ -3080,13 +3084,13 @@ static int ext4_symlink(struct inode *dir,
encryption_required = (ext4_encrypted_inode(dir) ||
DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
if (encryption_required) {
- err = ext4_get_encryption_info(dir);
+ err = fscrypt_get_encryption_info(dir);
if (err)
return err;
- if (ext4_encryption_info(dir) == NULL)
+ if (!fscrypt_has_encryption_key(dir))
return -EPERM;
- disk_link.len = (ext4_fname_encrypted_size(dir, len) +
- sizeof(struct ext4_encrypted_symlink_data));
+ disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
+ sizeof(struct fscrypt_symlink_data));
sd = kzalloc(disk_link.len, GFP_KERNEL);
if (!sd)
return -ENOMEM;
@@ -3134,13 +3138,12 @@ static int ext4_symlink(struct inode *dir,
if (encryption_required) {
struct qstr istr;
- struct ext4_str ostr;
+ struct fscrypt_str ostr =
+ FSTR_INIT(sd->encrypted_path, disk_link.len);
istr.name = (const unsigned char *) symname;
istr.len = len;
- ostr.name = sd->encrypted_path;
- ostr.len = disk_link.len;
- err = ext4_fname_usr_to_disk(inode, &istr, &ostr);
+ err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
if (err < 0)
goto err_drop_inode;
sd->len = cpu_to_le16(ostr.len);
@@ -3229,7 +3232,7 @@ static int ext4_link(struct dentry *old_dentry,
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
if (ext4_encrypted_inode(dir) &&
- !ext4_is_child_context_consistent_with_parent(dir, inode))
+ !fscrypt_has_permitted_context(dir, inode))
return -EPERM;
if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
@@ -3552,8 +3555,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((old.dir != new.dir) &&
ext4_encrypted_inode(new.dir) &&
- !ext4_is_child_context_consistent_with_parent(new.dir,
- old.inode)) {
+ !fscrypt_has_permitted_context(new.dir, old.inode)) {
retval = -EPERM;
goto end_rename;
}
@@ -3725,10 +3727,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((ext4_encrypted_inode(old_dir) ||
ext4_encrypted_inode(new_dir)) &&
(old_dir != new_dir) &&
- (!ext4_is_child_context_consistent_with_parent(new_dir,
- old.inode) ||
- !ext4_is_child_context_consistent_with_parent(old_dir,
- new.inode)))
+ (!fscrypt_has_permitted_context(new_dir, old.inode) ||
+ !fscrypt_has_permitted_context(old_dir, new.inode)))
return -EPERM;
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d77d15f4b6744..a6132a7309674 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -23,6 +23,8 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <linux/fscrypto.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -66,7 +68,6 @@ static void ext4_finish_bio(struct bio *bio)
struct page *page = bvec->bv_page;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
struct page *data_page = NULL;
- struct ext4_crypto_ctx *ctx = NULL;
#endif
struct buffer_head *bh, *head;
unsigned bio_start = bvec->bv_offset;
@@ -81,8 +82,7 @@ static void ext4_finish_bio(struct bio *bio)
if (!page->mapping) {
/* The bounce data pages are unmapped. */
data_page = page;
- ctx = (struct ext4_crypto_ctx *)page_private(data_page);
- page = ctx->w.control_page;
+ fscrypt_pullback_bio_page(&page, false);
}
#endif
@@ -112,8 +112,8 @@ static void ext4_finish_bio(struct bio *bio)
local_irq_restore(flags);
if (!under_io) {
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (ctx)
- ext4_restore_control_page(data_page);
+ if (data_page)
+ fscrypt_restore_control_page(data_page);
#endif
end_page_writeback(page);
}
@@ -339,11 +339,10 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
- int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : WRITE;
- bio_get(io->io_bio);
- submit_bio(io_op, io->io_bio);
- bio_put(io->io_bio);
+ int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : 0;
+ bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
+ submit_bio(io->io_bio);
}
io->io_bio = NULL;
}
@@ -432,8 +431,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
@@ -470,9 +469,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
nr_to_submit) {
- data_page = ext4_encrypt(inode, page);
+ gfp_t gfp_flags = GFP_NOFS;
+
+ retry_encrypt:
+ data_page = fscrypt_encrypt_page(inode, page, gfp_flags);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
+ if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
+ if (io->io_bio) {
+ ext4_io_submit(io);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ }
+ gfp_flags |= __GFP_NOFAIL;
+ goto retry_encrypt;
+ }
data_page = NULL;
goto out;
}
@@ -500,7 +510,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
if (ret) {
out:
if (data_page)
- ext4_restore_control_page(data_page);
+ fscrypt_restore_control_page(data_page);
printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
redirty_page_for_writepage(wbc, page);
do {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5dc5e95063de2..a81b829d56def 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -23,7 +23,7 @@
*
* then this code just gives up and calls the buffer_head-based read function.
* It does handle a page which has holes at the end - that is a common case:
- * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ * the end-of-file on blocksize < PAGE_SIZE setups.
*
*/
@@ -46,37 +46,6 @@
#include "ext4.h"
-/*
- * Call ext4_decrypt on every single page, reusing the encryption
- * context.
- */
-static void completion_pages(struct work_struct *work)
-{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct ext4_crypto_ctx *ctx =
- container_of(work, struct ext4_crypto_ctx, r.work);
- struct bio *bio = ctx->r.bio;
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
-
- int ret = ext4_decrypt(page);
- if (ret) {
- WARN_ON_ONCE(1);
- SetPageError(page);
- } else
- SetPageUptodate(page);
- unlock_page(page);
- }
- ext4_release_crypto_ctx(ctx);
- bio_put(bio);
-#else
- BUG();
-#endif
-}
-
static inline bool ext4_bio_encrypted(struct bio *bio)
{
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -104,14 +73,10 @@ static void mpage_end_io(struct bio *bio)
int i;
if (ext4_bio_encrypted(bio)) {
- struct ext4_crypto_ctx *ctx = bio->bi_private;
-
if (bio->bi_error) {
- ext4_release_crypto_ctx(ctx);
+ fscrypt_release_ctx(bio->bi_private);
} else {
- INIT_WORK(&ctx->r.work, completion_pages);
- ctx->r.bio = bio;
- queue_work(ext4_read_workqueue, &ctx->r.work);
+ fscrypt_decrypt_bio_pages(bio->bi_private, bio);
return;
}
}
@@ -135,12 +100,11 @@ int ext4_mpage_readpages(struct address_space *mapping,
unsigned nr_pages)
{
struct bio *bio = NULL;
- unsigned page_idx;
sector_t last_block_in_bio = 0;
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
@@ -157,7 +121,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
map.m_len = 0;
map.m_flags = 0;
- for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
+ for (; nr_pages; nr_pages--) {
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
@@ -166,14 +130,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
- mapping_gfp_constraint(mapping, GFP_KERNEL)))
+ readahead_gfp_mask(mapping)))
goto next_page;
}
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
@@ -217,7 +181,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
set_error_page:
SetPageError(page);
zero_user_segment(page, 0,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
unlock_page(page);
goto next_page;
}
@@ -250,7 +214,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
}
if (first_hole != blocks_per_page) {
zero_user_segment(page, first_hole << blkbits,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -271,15 +235,15 @@ int ext4_mpage_readpages(struct address_space *mapping,
*/
if (bio && (last_block_in_bio != blocks[0] - 1)) {
submit_and_realloc:
- submit_bio(READ, bio);
+ submit_bio(bio);
bio = NULL;
}
if (bio == NULL) {
- struct ext4_crypto_ctx *ctx = NULL;
+ struct fscrypt_ctx *ctx = NULL;
if (ext4_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
- ctx = ext4_get_crypto_ctx(inode);
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
}
@@ -287,13 +251,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
- ext4_release_crypto_ctx(ctx);
+ fscrypt_release_ctx(ctx);
goto set_error_page;
}
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio->bi_private = ctx;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
}
length = first_hole << blkbits;
@@ -303,14 +268,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
(first_hole != blocks_per_page)) {
- submit_bio(READ, bio);
+ submit_bio(bio);
bio = NULL;
} else
last_block_in_bio = blocks[blocks_per_page - 1];
goto next_page;
confused:
if (bio) {
- submit_bio(READ, bio);
+ submit_bio(bio);
bio = NULL;
}
if (!PageUptodate(page))
@@ -319,10 +284,10 @@ int ext4_mpage_readpages(struct address_space *mapping,
unlock_page(page);
next_page:
if (pages)
- page_cache_release(page);
+ put_page(page);
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- submit_bio(READ, bio);
+ submit_bio(bio);
return 0;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 34038e3598d59..cf681004b1965 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb)
*/
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
ext4_warning(sb, "There are errors in the filesystem, "
- "so online resizing is not allowed\n");
+ "so online resizing is not allowed");
return -EPERM;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5392975158963..1c593aa0218ee 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -944,9 +945,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- ei->i_crypt_info = NULL;
-#endif
return &ei->vfs_inode;
}
@@ -1025,8 +1023,7 @@ void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->jinode = NULL;
}
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (EXT4_I(inode)->i_crypt_info)
- ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
+ fscrypt_put_encryption_info(inode, NULL);
#endif
}
@@ -1093,6 +1090,90 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return try_to_free_buffers(page);
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
+}
+
+static int ext4_key_prefix(struct inode *inode, u8 **key)
+{
+ *key = EXT4_SB(inode->i_sb)->key_prefix;
+ return EXT4_SB(inode->i_sb)->key_prefix_size;
+}
+
+static int ext4_prepare_context(struct inode *inode)
+{
+ return ext4_convert_inline_data(inode);
+}
+
+static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ handle_t *handle;
+ int res, res2;
+
+ /* fs_data is null when internally used. */
+ if (fs_data) {
+ res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
+ len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ }
+ return res;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC,
+ ext4_jbd2_credits_xattr(inode));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
+ len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ res = ext4_mark_inode_dirty(handle, inode);
+ if (res)
+ EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+ }
+ res2 = ext4_journal_stop(handle);
+ if (!res)
+ res = res2;
+ return res;
+}
+
+static int ext4_dummy_context(struct inode *inode)
+{
+ return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
+}
+
+static unsigned ext4_max_namelen(struct inode *inode)
+{
+ return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
+ EXT4_NAME_LEN;
+}
+
+static struct fscrypt_operations ext4_cryptops = {
+ .get_context = ext4_get_context,
+ .key_prefix = ext4_key_prefix,
+ .prepare_context = ext4_prepare_context,
+ .set_context = ext4_set_context,
+ .dummy_context = ext4_dummy_context,
+ .is_encrypted = ext4_encrypted_inode,
+ .empty_dir = ext4_empty_dir,
+ .max_namelen = ext4_max_namelen,
+};
+#else
+static struct fscrypt_operations ext4_cryptops = {
+ .is_encrypted = ext4_encrypted_inode,
+};
+#endif
+
#ifdef CONFIG_QUOTA
static char *quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
@@ -1113,6 +1194,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
+static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -1129,7 +1211,7 @@ static const struct dquot_operations ext4_quota_operations = {
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
.get_projid = ext4_get_projid,
- .get_next_id = dquot_get_next_id,
+ .get_next_id = ext4_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -1323,9 +1405,9 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
return -1;
}
if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
- "when QUOTA feature is enabled");
- return -1;
+ ext4_msg(sb, KERN_INFO, "Journaled quota options "
+ "ignored when QUOTA feature is enabled");
+ return 1;
}
qname = match_strdup(args);
if (!qname) {
@@ -1688,10 +1770,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Cannot set journaled quota options "
+ ext4_msg(sb, KERN_INFO,
+ "Quota format mount options ignored "
"when QUOTA feature is enabled");
- return -1;
+ return 1;
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
@@ -1756,11 +1838,11 @@ static int parse_options(char *options, struct super_block *sb,
#ifdef CONFIG_QUOTA
if (ext4_has_feature_quota(sb) &&
(test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
- ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
- "feature is enabled");
- return 0;
- }
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota "
+ "mount options ignored.");
+ clear_opt(sb, USRQUOTA);
+ clear_opt(sb, GRPQUOTA);
+ } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
@@ -1784,7 +1866,7 @@ static int parse_options(char *options, struct super_block *sb,
int blocksize =
BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
- if (blocksize < PAGE_CACHE_SIZE) {
+ if (blocksize < PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"dioread_nolock if block size != PAGE_SIZE");
return 0;
@@ -2066,23 +2148,25 @@ failed:
static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
- int offset;
+ int offset = offsetof(struct ext4_group_desc, bg_checksum);
__u16 crc = 0;
__le32 le_group = cpu_to_le32(block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (ext4_has_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
- __le16 save_csum;
__u32 csum32;
+ __u16 dummy_csum = 0;
- save_csum = gdp->bg_checksum;
- gdp->bg_checksum = 0;
csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
- sbi->s_desc_size);
- gdp->bg_checksum = save_csum;
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
+ sizeof(dummy_csum));
+ offset += sizeof(dummy_csum);
+ if (offset < sbi->s_desc_size)
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
+ sbi->s_desc_size - offset);
crc = csum32 & 0xFFFF;
goto out;
@@ -2092,8 +2176,6 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
if (!ext4_has_feature_gdt_csum(sb))
return 0;
- offset = offsetof(struct ext4_group_desc, bg_checksum);
-
crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
crc = crc16(crc, (__u8 *)gdp, offset);
@@ -2276,6 +2358,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
while (es->s_last_orphan) {
struct inode *inode;
+ /*
+ * We may have encountered an error during cleanup; if
+ * so, skip the rest.
+ */
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ es->s_last_orphan = 0;
+ break;
+ }
+
inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
if (IS_ERR(inode)) {
es->s_last_orphan = 0;
@@ -3414,17 +3506,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
+ ext4_msg(sb, KERN_ERR,
+ "Number of reserved GDT blocks insanely large: %d",
+ le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
+ goto failed_mount;
+ }
+
if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
- if (blocksize != PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "error: unsupported blocksize for dax");
- goto failed_mount;
- }
- if (!sb->s_bdev->bd_disk->fops->direct_access) {
- ext4_msg(sb, KERN_ERR,
- "error: device does not support dax");
+ err = bdev_dax_supported(sb, blocksize);
+ if (err)
goto failed_mount;
- }
}
if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
@@ -3691,6 +3783,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
+ sb->s_cop = &ext4_cryptops;
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (ext4_has_feature_quota(sb))
@@ -3808,7 +3901,7 @@ no_journal:
}
if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
- (blocksize != PAGE_CACHE_SIZE)) {
+ (blocksize != PAGE_SIZE)) {
ext4_msg(sb, KERN_ERR,
"Unsupported blocksize for fs encryption");
goto failed_mount_wq;
@@ -3929,6 +4022,9 @@ no_journal:
if (!err)
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
+ if (!err)
+ err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount6;
@@ -3998,6 +4094,11 @@ no_journal:
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
kfree(orig_data);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX,
+ EXT4_KEY_DESC_PREFIX_SIZE);
+ sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE;
+#endif
return 0;
cantfind_ext4:
@@ -4206,7 +4307,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
+ ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -4329,20 +4430,6 @@ static int ext4_commit_super(struct super_block *sb, int sync)
if (!sbh || block_device_ejected(sb))
return error;
- if (buffer_write_io_error(sbh)) {
- /*
- * Oh, dear. A previous attempt to write the
- * superblock failed. This could happen because the
- * USB device was yanked out. Or it could happen to
- * be a transient write error and maybe the block will
- * be remapped. Nothing we can do but to retry the
- * write and hope for the best.
- */
- ext4_msg(sb, KERN_ERR, "previous I/O error to "
- "superblock detected");
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
- }
/*
* If the file system is mounted read-only, don't update the
* superblock write time. This avoids updating the superblock
@@ -4373,7 +4460,23 @@ static int ext4_commit_super(struct super_block *sb, int sync)
&EXT4_SB(sb)->s_freeinodes_counter));
BUFFER_TRACE(sbh, "marking dirty");
ext4_superblock_csum_set(sb);
+ lock_buffer(sbh);
+ if (buffer_write_io_error(sbh)) {
+ /*
+ * Oh, dear. A previous attempt to write the
+ * superblock failed. This could happen because the
+ * USB device was yanked out. Or it could happen to
+ * be a transient write error and maybe the block will
+ * be remapped. Nothing we can do but to retry the
+ * write and hope for the best.
+ */
+ ext4_msg(sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
mark_buffer_dirty(sbh);
+ unlock_buffer(sbh);
if (sync) {
error = __sync_dirty_buffer(sbh,
test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
@@ -5028,6 +5131,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
EXT4_SB(sb)->s_jquota_fmt, type);
}
+static void lockdep_set_quota_inode(struct inode *inode, int subclass)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ /* The first argument of lockdep_set_subclass has to be
+ * *exactly* the same as the argument to init_rwsem() --- in
+ * this case, in init_once() --- or lockdep gets unhappy
+ * because the name of the lock is set using the
+ * stringification of the argument to init_rwsem().
+ */
+ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */
+ lockdep_set_subclass(&ei->i_data_sem, subclass);
+}
+
/*
* Standard function to be called on quota_on
*/
@@ -5067,8 +5184,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (err)
return err;
}
-
- return dquot_quota_on(sb, type, format_id, path);
+ lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
+ return err;
}
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
@@ -5095,8 +5216,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
+ if (err)
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
return err;
}
@@ -5253,6 +5377,17 @@ out:
return len;
}
+static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ const struct quota_format_ops *ops;
+
+ if (!sb_has_quota_loaded(sb, qid->type))
+ return -ESRCH;
+ ops = sb_dqopt(sb)->ops[qid->type];
+ if (!ops || !ops->get_next_id)
+ return -ENOSYS;
+ return dquot_get_next_id(sb, qid);
+}
#endif
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
@@ -5392,7 +5527,6 @@ out5:
static void __exit ext4_exit_fs(void)
{
- ext4_exit_crypto();
ext4_destroy_lazyinit_thread();
unregister_as_ext2();
unregister_as_ext3();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 6f7ee30a89ce8..4d83d9e05f2e8 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -22,23 +22,22 @@
#include "ext4.h"
#include "xattr.h"
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
static const char *ext4_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
- struct ext4_str cstr, pstr;
- struct ext4_encrypted_symlink_data *sd;
+ struct fscrypt_str cstr, pstr;
+ struct fscrypt_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
int res;
- u32 plen, max_size = inode->i_sb->s_blocksize;
+ u32 max_size = inode->i_sb->s_blocksize;
if (!dentry)
return ERR_PTR(-ECHILD);
- res = ext4_get_encryption_info(inode);
+ res = fscrypt_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
@@ -54,38 +53,35 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
}
/* Symlink is encrypted */
- sd = (struct ext4_encrypted_symlink_data *)caddr;
+ sd = (struct fscrypt_symlink_data *)caddr;
cstr.name = sd->encrypted_path;
cstr.len = le16_to_cpu(sd->len);
- if ((cstr.len +
- sizeof(struct ext4_encrypted_symlink_data) - 1) >
- max_size) {
+ if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
/* Symlink data on the disk is corrupted */
res = -EFSCORRUPTED;
goto errout;
}
- plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
- EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len;
- paddr = kmalloc(plen + 1, GFP_NOFS);
- if (!paddr) {
- res = -ENOMEM;
+
+ res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ if (res)
goto errout;
- }
- pstr.name = paddr;
- pstr.len = plen;
- res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+
+ res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
if (res < 0)
goto errout;
+
+ paddr = pstr.name;
+
/* Null-terminate the name */
- if (res <= plen)
+ if (res <= pstr.len)
paddr[res] = '\0';
if (cpage)
- page_cache_release(cpage);
+ put_page(cpage);
set_delayed_call(done, kfree_link, paddr);
return paddr;
errout:
if (cpage)
- page_cache_release(cpage);
+ put_page(cpage);
kfree(paddr);
return ERR_PTR(res);
}
@@ -99,7 +95,6 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
};
-#endif
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 1420a3c614afb..73bcfd41f5f26 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -359,7 +359,6 @@ static int name##_open(struct inode *inode, struct file *file) \
} \
\
static const struct file_operations ext4_seq_##name##_fops = { \
- .owner = THIS_MODULE, \
.open = name##_open, \
.read = seq_read, \
.llseek = seq_lseek, \
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 0441e055c8e8b..39e9cfb1b3715 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -121,17 +121,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- __le32 save_csum;
__le64 dsk_block_nr = cpu_to_le64(block_nr);
+ __u32 dummy_csum = 0;
+ int offset = offsetof(struct ext4_xattr_header, h_checksum);
- save_csum = hdr->h_checksum;
- hdr->h_checksum = 0;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
- EXT4_BLOCK_SIZE(inode->i_sb));
+ csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ offset += sizeof(dummy_csum);
+ csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+ EXT4_BLOCK_SIZE(inode->i_sb) - offset);
- hdr->h_checksum = save_csum;
return cpu_to_le32(csum);
}
@@ -230,6 +231,27 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
return error;
}
+static int
+__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
+ void *end, const char *function, unsigned int line)
+{
+ struct ext4_xattr_entry *entry = IFIRST(header);
+ int error = -EFSCORRUPTED;
+
+ if (((void *) header >= end) ||
+ (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC)))
+ goto errout;
+ error = ext4_xattr_check_names(entry, end, entry);
+errout:
+ if (error)
+ __ext4_error_inode(inode, function, line, 0,
+ "corrupted in-inode xattr");
+ return error;
+}
+
+#define xattr_check_inode(inode, header, end) \
+ __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
+
static inline int
ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
{
@@ -341,7 +363,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(entry, end, entry);
+ error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
error = ext4_xattr_find_entry(&entry, name_index, name,
@@ -477,7 +499,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
+ error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
@@ -1040,8 +1062,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = ext4_xattr_check_names(IFIRST(header), is->s.end,
- IFIRST(header));
+ error = xattr_check_inode(inode, header, is->s.end);
if (error)
return error;
/* Find the named attribute. */
@@ -1356,6 +1377,10 @@ retry:
last = entry;
total_ino = sizeof(struct ext4_xattr_ibody_header);
+ error = xattr_check_inode(inode, header, end);
+ if (error)
+ goto cleanup;
+
free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
if (free >= new_extra_isize) {
entry = IFIRST(header);
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 3e81bdca071a6..a8921112030d3 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -13,19 +13,20 @@
static int
ext4_xattr_security_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY,
name, buffer, size);
}
static int
ext4_xattr_security_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 2a3c6f9b8cb84..c7765c7357144 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -20,19 +20,20 @@ ext4_xattr_trusted_list(struct dentry *dentry)
static int
ext4_xattr_trusted_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name, void *buffer,
- size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
static int
ext4_xattr_trusted_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d152f431e432a..ca20e423034bf 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -19,23 +19,24 @@ ext4_xattr_user_list(struct dentry *dentry)
static int
ext4_xattr_user_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- if (!test_opt(dentry->d_sb, XATTR_USER))
+ if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER,
name, buffer, size);
}
static int
ext4_xattr_user_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- if (!test_opt(dentry->d_sb, XATTR_USER))
+ if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 1f8982a957f15..378c221d68a92 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -94,3 +94,11 @@ config F2FS_IO_TRACE
information and block IO patterns in the filesystem level.
If unsure, say N.
+
+config F2FS_FAULT_INJECTION
+ bool "F2FS fault injection facility"
+ depends on F2FS_FS
+ help
+ Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on.
+
+ If unsure, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index c8f25f7241f06..4dcc9e28dc5c7 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -115,7 +115,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
struct f2fs_acl_entry *entry;
int i;
- f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
+ f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
sizeof(struct f2fs_acl_entry), GFP_NOFS);
if (!f2fs_acl)
return ERR_PTR(-ENOMEM);
@@ -175,7 +175,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
if (retval > 0) {
- value = kmalloc(retval, GFP_F2FS_ZERO);
+ value = f2fs_kmalloc(retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
retval = f2fs_getxattr(inode, name_index, "", value,
@@ -190,9 +190,6 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
acl = ERR_PTR(retval);
kfree(value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
return acl;
}
@@ -204,7 +201,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
static int __f2fs_set_acl(struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
int name_index;
void *value = NULL;
size_t size = 0;
@@ -217,7 +213,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
return error;
- set_acl_inode(fi, inode->i_mode);
+ set_acl_inode(inode, inode->i_mode);
if (error == 0)
acl = NULL;
}
@@ -236,7 +232,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
if (acl) {
value = f2fs_acl_to_disk(acl, &size);
if (IS_ERR(value)) {
- clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
return (int)PTR_ERR(value);
}
}
@@ -247,7 +243,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
if (!error)
set_cached_acl(inode, type, acl);
- clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
return error;
}
@@ -388,6 +384,8 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
if (error)
return error;
+ f2fs_mark_inode_dirty_sync(inode);
+
if (default_acl) {
error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 997ca8edb6cbf..b2334d11dae80 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -37,7 +37,7 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
extern struct posix_acl *f2fs_get_acl(struct inode *, int);
-extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int f2fs_set_acl(struct inode *, struct posix_acl *, int);
extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
struct page *);
#else
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0955312e5ca04..f94d01e7d001f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -26,6 +26,14 @@
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *inode_entry_slab;
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
+{
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ sbi->sb->s_flags |= MS_RDONLY;
+ if (!end_io)
+ f2fs_flush_merged_bios(sbi);
+}
+
/*
* We guarantee no failure on the returned page.
*/
@@ -34,13 +42,14 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
struct address_space *mapping = META_MAPPING(sbi);
struct page *page = NULL;
repeat:
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
cond_resched();
goto repeat;
}
f2fs_wait_on_page_writeback(page, META, true);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
return page;
}
@@ -55,16 +64,17 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO,
+ .op = REQ_OP_READ,
+ .op_flags = READ_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = index,
.new_blkaddr = index,
.encrypted_page = NULL,
};
if (unlikely(!is_meta))
- fio.rw &= ~REQ_META;
+ fio.op_flags &= ~REQ_META;
repeat:
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
cond_resched();
goto repeat;
@@ -91,7 +101,7 @@ repeat:
* meta page.
*/
if (unlikely(!PageUptodate(page)))
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
out:
return page;
}
@@ -149,13 +159,14 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
- .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
+ .op = REQ_OP_READ,
+ .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD,
.encrypted_page = NULL,
};
struct blk_plug plug;
if (unlikely(type == META_POR))
- fio.rw &= ~REQ_META;
+ fio.op_flags &= ~REQ_META;
blk_start_plug(&plug);
for (; nrpages-- > 0; blkno++) {
@@ -186,7 +197,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
BUG();
}
- page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr);
+ page = f2fs_grab_cache_page(META_MAPPING(sbi),
+ fio.new_blkaddr, false);
if (!page)
continue;
if (PageUptodate(page)) {
@@ -211,7 +223,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
bool readahead = false;
page = find_get_page(META_MAPPING(sbi), index);
- if (!page || (page && !PageUptodate(page)))
+ if (!page || !PageUptodate(page))
readahead = true;
f2fs_put_page(page, 0);
@@ -255,6 +267,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ struct blk_plug plug;
long diff, written;
/* collect a number of dirty meta pages and write together */
@@ -267,7 +280,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
/* if mounting is failed, skip writing node pages */
mutex_lock(&sbi->cp_mutex);
diff = nr_pages_to_write(sbi, META, wbc);
+ blk_start_plug(&plug);
written = sync_meta_pages(sbi, META, wbc->nr_to_write);
+ blk_finish_plug(&plug);
mutex_unlock(&sbi->cp_mutex);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -355,9 +370,10 @@ static int f2fs_set_meta_page_dirty(struct page *page)
{
trace_f2fs_set_page_dirty(page, META);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
SetPagePrivate(page);
f2fs_trace_pid(page);
@@ -448,12 +464,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-void release_ino_entry(struct f2fs_sb_info *sbi)
+void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
{
struct ino_entry *e, *tmp;
int i;
- for (i = APPEND_INO; i <= UPDATE_INO; i++) {
+ for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) {
struct inode_management *im = &sbi->im[i];
spin_lock(&im->ino_lock);
@@ -473,6 +489,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
int err = 0;
spin_lock(&im->ino_lock);
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_ORPHAN)) {
+ spin_unlock(&im->ino_lock);
+ return -ENOSPC;
+ }
+#endif
if (unlikely(im->ino_num >= sbi->max_orphans))
err = -ENOSPC;
else
@@ -492,10 +515,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
spin_unlock(&im->ino_lock);
}
-void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+void add_orphan_inode(struct inode *inode)
{
/* add new orphan ino entry into list */
- __add_ino_entry(sbi, ino, ORPHAN_INO);
+ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO);
+ update_inode_page(inode);
}
void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -743,28 +767,25 @@ fail_no_cp:
static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (is_inode_flag_set(fi, flag))
+ if (is_inode_flag_set(inode, flag))
return;
- set_inode_flag(fi, flag);
- list_add_tail(&fi->dirty_list, &sbi->inode_list[type]);
+ set_inode_flag(inode, flag);
+ list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
stat_inc_dirty_inode(sbi, type);
}
static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (get_dirty_pages(inode) ||
- !is_inode_flag_set(F2FS_I(inode), flag))
+ if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag))
return;
- list_del_init(&fi->dirty_list);
- clear_inode_flag(fi, flag);
+ list_del_init(&F2FS_I(inode)->dirty_list);
+ clear_inode_flag(inode, flag);
stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
@@ -778,7 +799,8 @@ void update_dirty_page(struct inode *inode, struct page *page)
return;
spin_lock(&sbi->inode_lock[type]);
- __add_dirty_inode(inode, type);
+ if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH))
+ __add_dirty_inode(inode, type);
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
@@ -786,34 +808,21 @@ void update_dirty_page(struct inode *inode, struct page *page)
f2fs_trace_pid(page);
}
-void add_dirty_dir_inode(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- spin_lock(&sbi->inode_lock[DIR_INODE]);
- __add_dirty_inode(inode, DIR_INODE);
- spin_unlock(&sbi->inode_lock[DIR_INODE]);
-}
-
void remove_dirty_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
!S_ISLNK(inode->i_mode))
return;
+ if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH))
+ return;
+
spin_lock(&sbi->inode_lock[type]);
__remove_dirty_inode(inode, type);
spin_unlock(&sbi->inode_lock[type]);
-
- /* Only from the recovery routine */
- if (is_inode_flag_set(fi, FI_DELAY_IPUT)) {
- clear_inode_flag(fi, FI_DELAY_IPUT);
- iput(inode);
- }
}
int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
@@ -857,6 +866,34 @@ retry:
goto retry;
}
+int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
+{
+ struct list_head *head = &sbi->inode_list[DIRTY_META];
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+ s64 total = get_pages(sbi, F2FS_DIRTY_IMETA);
+
+ while (total--) {
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (list_empty(head)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return 0;
+ }
+ fi = list_entry(head->next, struct f2fs_inode_info,
+ gdirty_list);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ if (inode) {
+ update_inode_page(inode);
+ iput(inode);
+ }
+ };
+ return 0;
+}
+
/*
* Freeze all the FS-operations for checkpoint.
*/
@@ -883,6 +920,14 @@ retry_flush_dents:
goto retry_flush_dents;
}
+ if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
+ f2fs_unlock_all(sbi);
+ err = f2fs_sync_inode_meta(sbi);
+ if (err)
+ goto out;
+ goto retry_flush_dents;
+ }
+
/*
* POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush.
@@ -892,7 +937,7 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- err = sync_node_pages(sbi, 0, &wbc);
+ err = sync_node_pages(sbi, &wbc);
if (err) {
f2fs_unlock_all(sbi);
goto out;
@@ -907,6 +952,8 @@ out:
static void unblock_operations(struct f2fs_sb_info *sbi)
{
up_write(&sbi->node_write);
+
+ build_free_nids(sbi);
f2fs_unlock_all(sbi);
}
@@ -917,7 +964,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!get_pages(sbi, F2FS_WRITEBACK))
+ if (!atomic_read(&sbi->nr_wb_bios))
break;
io_schedule_timeout(5*HZ);
@@ -947,7 +994,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* This avoids to conduct wrong roll-forward operations and uses
* metapages, so should be called prior to sync_meta_pages below.
*/
- if (discard_next_dnode(sbi, discard_blk))
+ if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk))
invalidate = true;
/* Flush all the NAT/SIT pages */
@@ -1082,7 +1129,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
- sbi->alloc_valid_block_count = 0;
+ percpu_counter_set(&sbi->alloc_valid_block_count, 0);
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
@@ -1098,7 +1145,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
discard_blk);
- release_ino_entry(sbi);
+ release_ino_entry(sbi, false);
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e5c762b372390..d64d2a515cb2c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -19,6 +19,8 @@
#include <linux/bio.h>
#include <linux/prefetch.h>
#include <linux/uio.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include "f2fs.h"
@@ -45,7 +47,8 @@ static void f2fs_read_end_io(struct bio *bio)
struct page *page = bvec->bv_page;
if (!bio->bi_error) {
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
} else {
ClearPageUptodate(page);
SetPageError(page);
@@ -68,13 +71,12 @@ static void f2fs_write_end_io(struct bio *bio)
if (unlikely(bio->bi_error)) {
set_bit(AS_EIO, &page->mapping->flags);
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, true);
}
end_page_writeback(page);
- dec_page_count(sbi, F2FS_WRITEBACK);
}
-
- if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait))
+ if (atomic_dec_and_test(&sbi->nr_wb_bios) &&
+ wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
bio_put(bio);
@@ -98,6 +100,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
return bio;
}
+static inline void __submit_bio(struct f2fs_sb_info *sbi,
+ struct bio *bio, enum page_type type)
+{
+ if (!is_read_io(bio_op(bio))) {
+ atomic_inc(&sbi->nr_wb_bios);
+ if (f2fs_sb_mounted_hmsmr(sbi->sb) &&
+ current->plug && (type == DATA || type == NODE))
+ blk_finish_plug(current->plug);
+ }
+ submit_bio(bio);
+}
+
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -105,12 +119,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
if (!io->bio)
return;
- if (is_read_io(fio->rw))
+ if (is_read_io(fio->op))
trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
else
trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
- submit_bio(fio->rw, io->bio);
+ bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
+
+ __submit_bio(io->sbi, io->bio, fio->type);
io->bio = NULL;
}
@@ -176,10 +192,12 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
+ io->fio.op = REQ_OP_WRITE;
if (test_opt(sbi, NOBARRIER))
- io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO;
+ io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO;
else
- io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
+ io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META |
+ REQ_PRIO;
}
__submit_merged_bio(io);
out:
@@ -221,14 +239,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
- bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw));
+ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op));
- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
return -EFAULT;
}
+ bio_set_op_attrs(bio, fio->op, fio->op_flags);
- submit_bio(fio->rw, bio);
+ __submit_bio(fio->sbi, bio, fio->type);
return 0;
}
@@ -237,7 +256,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
struct f2fs_sb_info *sbi = fio->sbi;
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io;
- bool is_read = is_read_io(fio->rw);
+ bool is_read = is_read_io(fio->op);
struct page *bio_page;
io = is_read ? &sbi->read_io : &sbi->write_io[btype];
@@ -248,11 +267,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
down_write(&io->io_rwsem);
- if (!is_read)
- inc_page_count(sbi, F2FS_WRITEBACK);
-
if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
- io->fio.rw != fio->rw))
+ (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
@@ -265,8 +281,8 @@ alloc_new:
bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
- if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
+ if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
+ PAGE_SIZE) {
__submit_merged_bio(io);
goto alloc_new;
}
@@ -278,6 +294,16 @@ alloc_new:
trace_f2fs_submit_page_mbio(fio->page, fio);
}
+static void __set_data_blkaddr(struct dnode_of_data *dn)
+{
+ struct f2fs_node *rn = F2FS_NODE(dn->node_page);
+ __le32 *addr_array;
+
+ /* Get physical address of data block */
+ addr_array = blkaddr_in_node(rn);
+ addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
+}
+
/*
* Lock ordering for the change of data block address:
* ->data_page
@@ -286,19 +312,9 @@ alloc_new:
*/
void set_data_blkaddr(struct dnode_of_data *dn)
{
- struct f2fs_node *rn;
- __le32 *addr_array;
- struct page *node_page = dn->node_page;
- unsigned int ofs_in_node = dn->ofs_in_node;
-
- f2fs_wait_on_page_writeback(node_page, NODE, true);
-
- rn = F2FS_NODE(node_page);
-
- /* Get physical address of data block */
- addr_array = blkaddr_in_node(rn);
- addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
- if (set_page_dirty(node_page))
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+ __set_data_blkaddr(dn);
+ if (set_page_dirty(dn->node_page))
dn->node_changed = true;
}
@@ -309,24 +325,50 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
f2fs_update_extent_cache(dn);
}
-int reserve_new_block(struct dnode_of_data *dn)
+/* dn->ofs_in_node will be returned with up-to-date last block pointer */
+int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (!count)
+ return 0;
+
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
return -ENOSPC;
- trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
+ trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
+ dn->ofs_in_node, count);
- dn->data_blkaddr = NEW_ADDR;
- set_data_blkaddr(dn);
- mark_inode_dirty(dn->inode);
- sync_inode_page(dn);
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+
+ for (; count > 0; dn->ofs_in_node++) {
+ block_t blkaddr =
+ datablock_addr(dn->node_page, dn->ofs_in_node);
+ if (blkaddr == NULL_ADDR) {
+ dn->data_blkaddr = NEW_ADDR;
+ __set_data_blkaddr(dn);
+ count--;
+ }
+ }
+
+ if (set_page_dirty(dn->node_page))
+ dn->node_changed = true;
return 0;
}
+/* Should keep dn->ofs_in_node unchanged */
+int reserve_new_block(struct dnode_of_data *dn)
+{
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ int ret;
+
+ ret = reserve_new_blocks(dn, 1);
+ dn->ofs_in_node = ofs_in_node;
+ return ret;
+}
+
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
{
bool need_put = dn->inode_page ? false : true;
@@ -357,7 +399,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
}
struct page *get_read_data_page(struct inode *inode, pgoff_t index,
- int rw, bool for_write)
+ int op_flags, bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -367,7 +409,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index,
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.type = DATA,
- .rw = rw,
+ .op = REQ_OP_READ,
+ .op_flags = op_flags,
.encrypted_page = NULL,
};
@@ -406,8 +449,9 @@ got_it:
* see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
*/
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- SetPageUptodate(page);
+ zero_user_segment(page, 0, PAGE_SIZE);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
unlock_page(page);
return page;
}
@@ -466,14 +510,14 @@ repeat:
/* wait for read completion */
lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
return page;
}
@@ -517,8 +561,9 @@ struct page *get_new_data_page(struct inode *inode,
goto got_it;
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- SetPageUptodate(page);
+ zero_user_segment(page, 0, PAGE_SIZE);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
} else {
f2fs_put_page(page, 1);
@@ -530,11 +575,8 @@ struct page *get_new_data_page(struct inode *inode,
}
got_it:
if (new_i_size && i_size_read(inode) <
- ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) {
- i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT));
- /* Only the directory inode sets new_i_size */
- set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
- }
+ ((loff_t)(index + 1) << PAGE_SHIFT))
+ f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
return page;
}
@@ -545,15 +587,16 @@ static int __allocate_data_block(struct dnode_of_data *dn)
struct node_info ni;
int seg = CURSEG_WARM_DATA;
pgoff_t fofs;
+ blkcnt_t count = 1;
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
if (dn->data_blkaddr == NEW_ADDR)
goto alloc;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
return -ENOSPC;
alloc:
@@ -570,9 +613,9 @@ alloc:
/* update i_size */
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
- i_size_write(dn->inode,
- ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
+ if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
+ f2fs_i_size_write(dn->inode,
+ ((loff_t)(fofs + 1) << PAGE_SHIFT));
return 0;
}
@@ -582,8 +625,8 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
struct f2fs_map_blocks map;
ssize_t ret = 0;
- map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos);
- map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from));
+ map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
+ map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from));
map.m_next_pgofs = NULL;
if (f2fs_encrypted_inode(inode))
@@ -620,9 +663,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
- pgoff_t pgofs, end_offset;
+ int mode = create ? ALLOC_NODE : LOOKUP_NODE;
+ pgoff_t pgofs, end_offset, end;
int err = 0, ofs = 1;
+ unsigned int ofs_in_node, last_ofs_in_node;
+ blkcnt_t prealloc;
struct extent_info ei;
bool allocated = false;
block_t blkaddr;
@@ -632,6 +677,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
/* it only supports block size == page size */
pgofs = (pgoff_t)map->m_lblk;
+ end = pgofs + maxblocks;
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
map->m_pblk = ei.blk + pgofs - ei.fofs;
@@ -648,6 +694,8 @@ next_dnode:
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
+ if (flag == F2FS_GET_BLOCK_BMAP)
+ map->m_pblk = 0;
if (err == -ENOENT) {
err = 0;
if (map->m_next_pgofs)
@@ -657,6 +705,8 @@ next_dnode:
goto unlock_out;
}
+ prealloc = 0;
+ ofs_in_node = dn.ofs_in_node;
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
@@ -669,31 +719,40 @@ next_block:
goto sync_out;
}
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
- if (blkaddr == NULL_ADDR)
- err = reserve_new_block(&dn);
+ if (blkaddr == NULL_ADDR) {
+ prealloc++;
+ last_ofs_in_node = dn.ofs_in_node;
+ }
} else {
err = __allocate_data_block(&dn);
+ if (!err) {
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ allocated = true;
+ }
}
if (err)
goto sync_out;
- allocated = true;
map->m_flags = F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
} else {
+ if (flag == F2FS_GET_BLOCK_BMAP) {
+ map->m_pblk = 0;
+ goto sync_out;
+ }
if (flag == F2FS_GET_BLOCK_FIEMAP &&
blkaddr == NULL_ADDR) {
if (map->m_next_pgofs)
*map->m_next_pgofs = pgofs + 1;
}
if (flag != F2FS_GET_BLOCK_FIEMAP ||
- blkaddr != NEW_ADDR) {
- if (flag == F2FS_GET_BLOCK_BMAP)
- err = -ENOENT;
+ blkaddr != NEW_ADDR)
goto sync_out;
- }
}
}
+ if (flag == F2FS_GET_BLOCK_PRE_AIO)
+ goto skip;
+
if (map->m_len == 0) {
/* preallocated unwritten block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
@@ -705,36 +764,49 @@ next_block:
} else if ((map->m_pblk != NEW_ADDR &&
blkaddr == (map->m_pblk + ofs)) ||
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
- flag == F2FS_GET_BLOCK_PRE_DIO ||
- flag == F2FS_GET_BLOCK_PRE_AIO) {
+ flag == F2FS_GET_BLOCK_PRE_DIO) {
ofs++;
map->m_len++;
} else {
goto sync_out;
}
+skip:
dn.ofs_in_node++;
pgofs++;
- if (map->m_len < maxblocks) {
- if (dn.ofs_in_node < end_offset)
- goto next_block;
+ /* preallocate blocks in batch for one dnode page */
+ if (flag == F2FS_GET_BLOCK_PRE_AIO &&
+ (pgofs == end || dn.ofs_in_node == end_offset)) {
- if (allocated)
- sync_inode_page(&dn);
- f2fs_put_dnode(&dn);
+ dn.ofs_in_node = ofs_in_node;
+ err = reserve_new_blocks(&dn, prealloc);
+ if (err)
+ goto sync_out;
- if (create) {
- f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ map->m_len += dn.ofs_in_node - ofs_in_node;
+ if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
+ err = -ENOSPC;
+ goto sync_out;
}
- allocated = false;
- goto next_dnode;
+ dn.ofs_in_node = end_offset;
}
+ if (pgofs >= end)
+ goto sync_out;
+ else if (dn.ofs_in_node < end_offset)
+ goto next_block;
+
+ f2fs_put_dnode(&dn);
+
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, allocated);
+ }
+ allocated = false;
+ goto next_dnode;
+
sync_out:
- if (allocated)
- sync_inode_page(&dn);
f2fs_put_dnode(&dn);
unlock_out:
if (create) {
@@ -894,6 +966,37 @@ out:
return ret;
}
+struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
+ unsigned nr_pages)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct fscrypt_ctx *ctx = NULL;
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct bio *bio;
+
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return ERR_CAST(ctx); /* propagate the fscrypt_get_ctx() error */
+
+ /* wait for the page being moved by cleaning to finish writeback */
+ f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
+ }
+
+ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
+ if (!bio) {
+ if (ctx) /* only set when a crypto context was taken above */
+ fscrypt_release_ctx(ctx);
+ return ERR_PTR(-ENOMEM);
+ }
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
+ bio->bi_end_io = f2fs_read_end_io;
+ bio->bi_private = ctx; /* stays NULL unless a crypto context was taken above */
+
+ return bio; /* never NULL: a bio aimed at blkaddr, or an ERR_PTR() */
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -912,7 +1015,6 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
sector_t last_block;
sector_t last_block_in_file;
sector_t block_nr;
- struct block_device *bdev = inode->i_sb->s_bdev;
struct f2fs_map_blocks map;
map.m_pblk = 0;
@@ -928,7 +1030,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL))
+ page->index,
+ readahead_gfp_mask(mapping)))
goto next_page;
}
@@ -971,8 +1074,9 @@ got_it:
goto confused;
}
} else {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- SetPageUptodate(page);
+ zero_user_segment(page, 0, PAGE_SIZE);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
unlock_page(page);
goto next_page;
}
@@ -983,35 +1087,16 @@ got_it:
*/
if (bio && (last_block_in_bio != block_nr - 1)) {
submit_and_realloc:
- submit_bio(READ, bio);
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
if (bio == NULL) {
- struct fscrypt_ctx *ctx = NULL;
-
- if (f2fs_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
-
- ctx = fscrypt_get_ctx(inode);
- if (IS_ERR(ctx))
- goto set_error_page;
-
- /* wait the page to be moved by cleaning */
- f2fs_wait_on_encrypted_page_writeback(
- F2FS_I_SB(inode), block_nr);
- }
-
- bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
+ bio = f2fs_grab_bio(inode, block_nr, nr_pages);
+ if (IS_ERR(bio)) {
+ bio = NULL;
goto set_error_page;
}
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr);
- bio->bi_end_io = f2fs_read_end_io;
- bio->bi_private = ctx;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
}
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
@@ -1021,22 +1106,22 @@ submit_and_realloc:
goto next_page;
set_error_page:
SetPageError(page);
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
goto next_page;
confused:
if (bio) {
- submit_bio(READ, bio);
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
unlock_page(page);
next_page:
if (pages)
- page_cache_release(page);
+ put_page(page);
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- submit_bio(READ, bio);
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
return 0;
}
@@ -1092,14 +1177,24 @@ int do_write_data_page(struct f2fs_io_info *fio)
}
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ gfp_t gfp_flags = GFP_NOFS;
/* wait for GCed encrypted page writeback */
f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
fio->old_blkaddr);
-
- fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page);
+retry_encrypt:
+ fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+ gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
+ if (err == -ENOMEM) {
+ /* flush pending ios and wait for a while */
+ f2fs_flush_merged_bios(F2FS_I_SB(inode));
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ gfp_flags |= __GFP_NOFAIL;
+ err = 0;
+ goto retry_encrypt;
+ }
goto out_writepage;
}
}
@@ -1115,14 +1210,14 @@ int do_write_data_page(struct f2fs_io_info *fio)
!IS_ATOMIC_WRITTEN_PAGE(page) &&
need_inplace_update(inode))) {
rewrite_data_page(fio);
- set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+ set_inode_flag(inode, FI_UPDATE_WRITE);
trace_f2fs_do_write_data_page(page, IPU);
} else {
write_data_page(&dn, fio);
trace_f2fs_do_write_data_page(page, OPU);
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_APPEND_WRITE);
if (page->index == 0)
- set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
}
out_writepage:
f2fs_put_dnode(&dn);
@@ -1136,14 +1231,16 @@ static int f2fs_write_data_page(struct page *page,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
- >> PAGE_CACHE_SHIFT;
+ >> PAGE_SHIFT;
+ loff_t psize = (page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
struct f2fs_io_info fio = {
.sbi = sbi,
.type = DATA,
- .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ .op = REQ_OP_WRITE,
+ .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
.page = page,
.encrypted_page = NULL,
};
@@ -1157,34 +1254,34 @@ static int f2fs_write_data_page(struct page *page,
* If the offset is out-of-range of file size,
* this page does not have to be written to disk.
*/
- offset = i_size & (PAGE_CACHE_SIZE - 1);
+ offset = i_size & (PAGE_SIZE - 1);
if ((page->index >= end_index + 1) || !offset)
goto out;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
write:
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
if (f2fs_is_drop_cache(inode))
goto out;
- if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
- available_free_memory(sbi, BASE_CHECK))
+ /* we should not write 0'th page having journal header */
+ if (f2fs_is_volatile_file(inode) && (!page->index ||
+ (!wbc->for_reclaim &&
+ available_free_memory(sbi, BASE_CHECK))))
goto redirty_out;
+ /* we should bypass data pages to let the kworker jobs proceed */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ mapping_set_error(page->mapping, -EIO);
+ goto out;
+ }
+
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
- if (unlikely(f2fs_cp_error(sbi)))
- goto redirty_out;
err = do_write_data_page(&fio);
goto done;
}
- /* we should bypass data pages to proceed the kworkder jobs */
- if (unlikely(f2fs_cp_error(sbi))) {
- SetPageError(page);
- goto out;
- }
-
if (!wbc->for_reclaim)
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0))
@@ -1196,6 +1293,8 @@ write:
err = f2fs_write_inline_data(inode, page);
if (err == -EAGAIN)
err = do_write_data_page(&fio);
+ if (F2FS_I(inode)->last_disk_size < psize)
+ F2FS_I(inode)->last_disk_size = psize;
f2fs_unlock_op(sbi);
done:
if (err && err != -ENOENT)
@@ -1222,16 +1321,8 @@ out:
redirty_out:
redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
-}
-
-static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
-{
- struct address_space *mapping = data;
- int ret = mapping->a_ops->writepage(page, wbc);
- mapping_set_error(mapping, ret);
- return ret;
+ unlock_page(page);
+ return err;
}
/*
@@ -1240,8 +1331,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
* warm/hot data page.
*/
static int f2fs_write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
+ struct writeback_control *wbc)
{
int ret = 0;
int done = 0;
@@ -1254,10 +1344,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int cycled;
int range_whole = 0;
int tag;
- int step = 0;
pagevec_init(&pvec, 0);
-next:
+
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
@@ -1267,8 +1356,8 @@ next:
cycled = 0;
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
@@ -1312,9 +1401,6 @@ continue_unlock:
goto continue_unlock;
}
- if (step == is_cold_data(page))
- goto continue_unlock;
-
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
f2fs_wait_on_page_writeback(page,
@@ -1327,16 +1413,11 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- ret = (*writepage)(page, wbc, data);
+ ret = mapping->a_ops->writepage(page, wbc);
if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
- ret = 0;
- } else {
- done_index = page->index + 1;
- done = 1;
- break;
- }
+ done_index = page->index + 1;
+ done = 1;
+ break;
}
if (--wbc->nr_to_write <= 0 &&
@@ -1349,11 +1430,6 @@ continue_unlock:
cond_resched();
}
- if (step < 1) {
- step++;
- goto next;
- }
-
if (!cycled && !done) {
cycled = 1;
index = 0;
@@ -1371,9 +1447,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- bool locked = false;
+ struct blk_plug plug;
int ret;
- long diff;
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
@@ -1389,7 +1464,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
goto skip_write;
/* skip writing during file defragment */
- if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
+ if (is_inode_flag_set(inode, FI_DO_DEFRAG))
goto skip_write;
/* during POR, we don't need to trigger writepage at all. */
@@ -1398,20 +1473,16 @@ static int f2fs_write_data_pages(struct address_space *mapping,
trace_f2fs_writepages(mapping->host, wbc, DATA);
- diff = nr_pages_to_write(sbi, DATA, wbc);
-
- if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
- mutex_lock(&sbi->writepages);
- locked = true;
- }
- ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
- f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
- if (locked)
- mutex_unlock(&sbi->writepages);
+ blk_start_plug(&plug);
+ ret = f2fs_write_cache_pages(mapping, wbc);
+ blk_finish_plug(&plug);
+ /*
+ * if some pages were truncated, we cannot rely on their mapping->host
+ * to detect pending bios.
+ */
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
remove_dirty_inode(inode);
-
- wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return ret;
skip_write:
@@ -1448,11 +1519,11 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
* the block addresses when there is no need to fill the page.
*/
if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
- len == PAGE_CACHE_SIZE)
+ len == PAGE_SIZE)
return 0;
if (f2fs_has_inline_data(inode) ||
- (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+ (pos & PAGE_MASK) >= i_size_read(inode)) {
f2fs_lock_op(sbi);
locked = true;
}
@@ -1469,8 +1540,9 @@ restart:
if (f2fs_has_inline_data(inode)) {
if (pos + len <= MAX_INLINE_DATA) {
read_inline_data(page, ipage);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- set_inline_node(ipage);
+ set_inode_flag(inode, FI_DATA_EXIST);
+ if (inode->i_nlink)
+ set_inline_node(ipage);
} else {
err = f2fs_convert_inline_page(&dn, page);
if (err)
@@ -1486,7 +1558,7 @@ restart:
} else {
/* hole case */
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err || (!err && dn.data_blkaddr == NULL_ADDR)) {
+ if (err || dn.data_blkaddr == NULL_ADDR) {
f2fs_put_dnode(&dn);
f2fs_lock_op(sbi);
locked = true;
@@ -1513,7 +1585,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
- pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
+ pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
bool need_balance = false;
block_t blkaddr = NULL_ADDR;
int err = 0;
@@ -1561,55 +1633,52 @@ repeat:
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
- if (len == PAGE_CACHE_SIZE)
+ if (len == PAGE_SIZE)
goto out_update;
if (PageUptodate(page))
goto out_clear;
- if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ if ((pos & PAGE_MASK) >= i_size_read(inode)) {
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + len;
/* Reading beyond i_size is simple: memset to zero */
- zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+ zero_user_segments(page, 0, start, end, PAGE_SIZE);
goto out_update;
}
if (blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
} else {
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .type = DATA,
- .rw = READ_SYNC,
- .old_blkaddr = blkaddr,
- .new_blkaddr = blkaddr,
- .page = page,
- .encrypted_page = NULL,
- };
- err = f2fs_submit_page_bio(&fio);
- if (err)
- goto fail;
+ struct bio *bio;
- lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- err = -EIO;
+ bio = f2fs_grab_bio(inode, blkaddr, 1);
+ if (IS_ERR(bio)) {
+ err = PTR_ERR(bio);
+ goto fail;
+ }
+ bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ bio_put(bio);
+ err = -EFAULT;
goto fail;
}
+
+ __submit_bio(sbi, bio, DATA);
+
+ lock_page(page);
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
-
- /* avoid symlink page */
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- err = fscrypt_decrypt_page(page);
- if (err)
- goto fail;
+ if (unlikely(!PageUptodate(page))) {
+ err = -EIO;
+ goto fail;
}
}
out_update:
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
out_clear:
clear_cold_data(page);
return 0;
@@ -1630,13 +1699,11 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
set_page_dirty(page);
+ f2fs_put_page(page, 1);
- if (pos + copied > i_size_read(inode)) {
- i_size_write(inode, pos + copied);
- mark_inode_dirty(inode);
- }
+ if (pos + copied > i_size_read(inode))
+ f2fs_i_size_write(inode, pos + copied);
- f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
@@ -1655,12 +1722,13 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
return 0;
}
-static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
+ loff_t offset = iocb->ki_pos;
+ int rw = iov_iter_rw(iter);
int err;
err = check_direct_IO(inode, iter, offset);
@@ -1669,14 +1737,23 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return 0;
+ if (test_opt(F2FS_I_SB(inode), LFS))
+ return 0;
- trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+ trace_f2fs_direct_IO_enter(inode, offset, count, rw);
- err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
- if (err < 0 && iov_iter_rw(iter) == WRITE)
- f2fs_write_failed(mapping, offset + count);
+ down_read(&F2FS_I(inode)->dio_rwsem[rw]);
+ err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
+ up_read(&F2FS_I(inode)->dio_rwsem[rw]);
+
+ if (rw == WRITE) {
+ if (err > 0)
+ set_inode_flag(inode, FI_UPDATE_WRITE);
+ else if (err < 0)
+ f2fs_write_failed(mapping, offset + count);
+ }
- trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);
+ trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
return err;
}
@@ -1688,7 +1765,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
- (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
+ (offset % PAGE_SIZE || length != PAGE_SIZE))
return;
if (PageDirty(page)) {
@@ -1704,6 +1781,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
if (IS_ATOMIC_WRITTEN_PAGE(page))
return;
+ set_page_private(page, 0);
ClearPagePrivate(page);
}
@@ -1717,10 +1795,40 @@ int f2fs_release_page(struct page *page, gfp_t wait)
if (IS_ATOMIC_WRITTEN_PAGE(page))
return 0;
+ set_page_private(page, 0);
ClearPagePrivate(page);
return 1;
}
+/*
+ * This was copied from __set_page_dirty_buffers which gives higher performance
+ * on very high-speed storage (e.g., pmem).
+ */
+void f2fs_set_page_dirty_nobuffers(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ unsigned long flags;
+
+ if (unlikely(!mapping)) /* no mapping to tag or account against */
+ return;
+
+ spin_lock(&mapping->private_lock);
+ lock_page_memcg(page); /* released below, after the radix tree is tagged */
+ SetPageDirty(page);
+ spin_unlock(&mapping->private_lock);
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ WARN_ON_ONCE(!PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ unlock_page_memcg(page);
+
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ return;
+}
+
static int f2fs_set_data_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -1728,7 +1836,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
trace_f2fs_set_page_dirty(page, DATA);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (f2fs_is_atomic_file(inode)) {
if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
@@ -1743,7 +1852,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
}
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
update_dirty_page(inode, page);
return 1;
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 4fb6ef88a34f2..badd407bb622a 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -47,8 +47,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+ si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
- si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
+ si->wb_bios = atomic_read(&sbi->nr_wb_bios);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -58,6 +59,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_xattr = atomic_read(&sbi->inline_xattr);
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
+ si->orphans = sbi->im[ORPHAN_INO].ino_num;
si->utilization = utilization(sbi);
si->free_segs = free_segments(sbi);
@@ -143,6 +145,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
si->base_mem += 2 * sizeof(struct f2fs_inode_info);
si->base_mem += sizeof(*sbi->ckpt);
+ si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE;
/* build sm */
si->base_mem += sizeof(struct f2fs_sm_info);
@@ -164,7 +167,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build curseg */
si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
- si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+ si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE;
/* build dirty segmap */
si->base_mem += sizeof(struct dirty_seglist_info);
@@ -192,7 +195,7 @@ get_cache:
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
- for (i = 0; i <= UPDATE_INO; i++)
+ for (i = 0; i <= ORPHAN_INO; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
si->cache_mem += atomic_read(&sbi->total_ext_tree) *
sizeof(struct extent_tree);
@@ -201,9 +204,9 @@ get_cache:
si->page_mem = 0;
npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
static int stat_show(struct seq_file *s, void *v)
@@ -216,8 +219,9 @@ static int stat_show(struct seq_file *s, void *v)
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n",
- si->sbi->sb->s_bdev, i++);
+ seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n",
+ si->sbi->sb->s_bdev, i++,
+ f2fs_readonly(si->sbi->sb) ? "RO": "RW");
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -237,6 +241,8 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_inode);
seq_printf(s, " - Inline_dentry Inode: %u\n",
si->inline_dir);
+ seq_printf(s, " - Orphan Inode: %u\n",
+ si->orphans);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
@@ -295,15 +301,15 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - inmem: %4d, wb: %4d\n",
- si->inmem_pages, si->wb_pages);
- seq_printf(s, " - nodes: %4d in %4d\n",
+ seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n",
+ si->inmem_pages, si->wb_bios);
+ seq_printf(s, " - nodes: %4lld in %4d\n",
si->ndirty_node, si->node_pages);
- seq_printf(s, " - dents: %4d in dirs:%4d\n",
- si->ndirty_dent, si->ndirty_dirs);
- seq_printf(s, " - datas: %4d in files:%4d\n",
+ seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n",
+ si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
+ seq_printf(s, " - datas: %4lld in files:%4d\n",
si->ndirty_data, si->ndirty_files);
- seq_printf(s, " - meta: %4d in %4d\n",
+ seq_printf(s, " - meta: %4lld in %4d\n",
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 80641ad827459..9054aeac80152 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -17,8 +17,8 @@
static unsigned long dir_blocks(struct inode *inode)
{
- return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
- >> PAGE_CACHE_SHIFT;
+ return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
+ >> PAGE_SHIFT;
}
static unsigned int dir_buckets(unsigned int level, int dir_level)
@@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
[F2FS_FT_SYMLINK] = DT_LNK,
};
-#define S_SHIFT 12
static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = F2FS_FT_DIR,
@@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
+unsigned char get_de_type(struct f2fs_dir_entry *de)
+{
+ if (de->file_type < F2FS_FT_MAX) /* valid index into f2fs_filetype_table[] */
+ return f2fs_filetype_table[de->file_type];
+ return DT_UNKNOWN; /* file_type outside the table */
+}
+
static unsigned long dir_block_index(unsigned int level,
int dir_level, unsigned int idx)
{
@@ -95,11 +101,6 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
else
kunmap(dentry_page);
- /*
- * For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs has occurred.
- */
- f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0);
return de;
}
@@ -124,6 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
de = &d->dentry[bit_pos];
+ if (unlikely(!de->name_len)) {
+ bit_pos++;
+ continue;
+ }
+
/* encrypted case */
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
@@ -141,10 +147,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
*max_slots = max_len;
max_len = 0;
- /* remain bug on condition */
- if (unlikely(!de->name_len))
- d->max = -1;
-
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
@@ -183,8 +185,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
/* no need to allocate new dentry pages to all the indices */
dentry_page = find_data_page(dir, bidx);
if (IS_ERR(dentry_page)) {
- room = true;
- continue;
+ if (PTR_ERR(dentry_page) == -ENOENT) {
+ room = true;
+ continue;
+ } else {
+ *res_page = dentry_page;
+ break;
+ }
}
de = find_in_block(dentry_page, fname, namehash, &max_slots,
@@ -212,7 +219,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
* Entry is guaranteed to be valid.
*/
struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
- struct qstr *child, struct page **res_page)
+ const struct qstr *child, struct page **res_page)
{
unsigned long npages = dir_blocks(dir);
struct f2fs_dir_entry *de = NULL;
@@ -221,19 +228,22 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct fscrypt_name fname;
int err;
- *res_page = NULL;
-
err = fscrypt_setup_filename(dir, child, 1, &fname);
- if (err)
+ if (err) {
+ *res_page = ERR_PTR(err);
return NULL;
+ }
if (f2fs_has_inline_dentry(dir)) {
+ *res_page = NULL;
de = find_in_inline_dir(dir, &fname, res_page);
goto out;
}
- if (npages == 0)
+ if (npages == 0) {
+ *res_page = NULL;
goto out;
+ }
max_depth = F2FS_I(dir)->i_current_depth;
if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
@@ -241,13 +251,13 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
"Corrupted max_depth of %lu: %u",
dir->i_ino, max_depth);
max_depth = MAX_DIR_HASH_DEPTH;
- F2FS_I(dir)->i_current_depth = max_depth;
- mark_inode_dirty(dir);
+ f2fs_i_depth_write(dir, max_depth);
}
for (level = 0; level < max_depth; level++) {
+ *res_page = NULL;
de = find_in_level(dir, level, &fname, res_page);
- if (de)
+ if (de || IS_ERR(*res_page))
break;
}
out:
@@ -257,35 +267,22 @@ out:
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
- struct page *page;
- struct f2fs_dir_entry *de;
- struct f2fs_dentry_block *dentry_blk;
-
- if (f2fs_has_inline_dentry(dir))
- return f2fs_parent_inline_dir(dir, p);
-
- page = get_lock_data_page(dir, 0, false);
- if (IS_ERR(page))
- return NULL;
+ struct qstr dotdot = QSTR_INIT("..", 2);
- dentry_blk = kmap(page);
- de = &dentry_blk->dentry[1];
- *p = page;
- unlock_page(page);
- return de;
+ return f2fs_find_entry(dir, &dotdot, p);
}
-ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
+ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
+ struct page **page)
{
ino_t res = 0;
struct f2fs_dir_entry *de;
- struct page *page;
- de = f2fs_find_entry(dir, qstr, &page);
+ de = f2fs_find_entry(dir, qstr, page);
if (de) {
res = le32_to_cpu(de->ino);
- f2fs_dentry_kunmap(dir, page);
- f2fs_put_page(page, 0);
+ f2fs_dentry_kunmap(dir, *page);
+ f2fs_put_page(*page, 0);
}
return res;
@@ -301,9 +298,9 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
set_de_type(de, inode->i_mode);
f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- mark_inode_dirty(dir);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ f2fs_mark_inode_dirty_sync(dir);
f2fs_put_page(page, 1);
}
@@ -383,15 +380,20 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
struct page *page;
int err;
- if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ if (is_inode_flag_set(inode, FI_NEW_INODE)) {
page = new_inode_page(inode);
if (IS_ERR(page))
return page;
if (S_ISDIR(inode->i_mode)) {
+ /* in order to handle error case */
+ get_page(page);
err = make_empty_dir(inode, dir, page);
- if (err)
- goto error;
+ if (err) {
+ lock_page(page);
+ goto put_error;
+ }
+ put_page(page);
}
err = f2fs_init_acl(inode, dir, page, dpage);
@@ -422,7 +424,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
* This file should be checkpointed during fsync.
* We lost i_pino from now on.
*/
- if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ if (is_inode_flag_set(inode, FI_INC_LINK)) {
file_lost_pino(inode);
/*
* If link the tmpfile to alias through linkat path,
@@ -430,41 +432,33 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
*/
if (inode->i_nlink == 0)
remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
- inc_nlink(inode);
+ f2fs_i_links_write(inode, true);
}
return page;
put_error:
+ clear_nlink(inode);
+ update_inode(inode, page);
f2fs_put_page(page, 1);
-error:
- /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
- truncate_inode_pages(&inode->i_data, 0);
- truncate_blocks(inode, 0, false);
- remove_dirty_inode(inode);
- remove_inode_page(inode);
return ERR_PTR(err);
}
void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
- if (S_ISDIR(inode->i_mode)) {
- inc_nlink(dir);
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
- clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) {
+ if (S_ISDIR(inode->i_mode))
+ f2fs_i_links_write(dir, true);
+ clear_inode_flag(inode, FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- mark_inode_dirty(dir);
+ f2fs_mark_inode_dirty_sync(dir);
- if (F2FS_I(dir)->i_current_depth != current_depth) {
- F2FS_I(dir)->i_current_depth = current_depth;
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
+ if (F2FS_I(dir)->i_current_depth != current_depth)
+ f2fs_i_depth_write(dir, current_depth);
- if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ if (inode && is_inode_flag_set(inode, FI_INC_LINK))
+ clear_inode_flag(inode, FI_INC_LINK);
}
int room_for_filename(const void *bitmap, int slots, int max_slots)
@@ -509,11 +503,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
}
}
-/*
- * Caller should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op().
- */
-int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
struct inode *inode, nid_t ino, umode_t mode)
{
unsigned int bit_pos;
@@ -526,28 +516,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dentry_ptr d;
struct page *page = NULL;
- struct fscrypt_name fname;
- struct qstr new_name;
- int slots, err;
-
- err = fscrypt_setup_filename(dir, name, 0, &fname);
- if (err)
- return err;
-
- new_name.name = fname_name(&fname);
- new_name.len = fname_len(&fname);
-
- if (f2fs_has_inline_dentry(dir)) {
- err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
- if (!err || err != -EAGAIN)
- goto out;
- else
- err = 0;
- }
+ int slots, err = 0;
level = 0;
- slots = GET_DENTRY_SLOTS(new_name.len);
- dentry_hash = f2fs_dentry_hash(&new_name);
+ slots = GET_DENTRY_SLOTS(new_name->len);
+ dentry_hash = f2fs_dentry_hash(new_name);
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
@@ -556,10 +529,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
}
start:
- if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) {
- err = -ENOSPC;
- goto out;
- }
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_DIR_DEPTH))
+ return -ENOSPC;
+#endif
+ if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
+ return -ENOSPC;
/* Increase the depth, if required */
if (level == current_depth)
@@ -573,10 +548,8 @@ start:
for (block = bidx; block <= (bidx + nblock - 1); block++) {
dentry_page = get_new_data_page(dir, NULL, block, true);
- if (IS_ERR(dentry_page)) {
- err = PTR_ERR(dentry_page);
- goto out;
- }
+ if (IS_ERR(dentry_page))
+ return PTR_ERR(dentry_page);
dentry_blk = kmap(dentry_page);
bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
@@ -596,7 +569,7 @@ add_dentry:
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, &new_name, NULL);
+ page = init_inode_metadata(inode, dir, new_name, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -606,14 +579,12 @@ add_dentry:
}
make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
- f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos);
+ f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
set_page_dirty(dentry_page);
if (inode) {
- /* we don't need to mark_inode_dirty now */
- F2FS_I(inode)->i_pino = dir->i_ino;
- update_inode(inode, page);
+ f2fs_i_pino_write(inode, dir->i_ino);
f2fs_put_page(page, 1);
}
@@ -622,13 +593,36 @@ fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
- if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
- update_inode_page(dir);
- clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
-out:
+
+ return err;
+}
+
+/*
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ */
+int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+ struct inode *inode, nid_t ino, umode_t mode)
+{
+ struct fscrypt_name fname;
+ struct qstr new_name;
+ int err;
+
+ err = fscrypt_setup_filename(dir, name, 0, &fname);
+ if (err)
+ return err;
+
+ new_name.name = fname_name(&fname);
+ new_name.len = fname_len(&fname);
+
+ err = -EAGAIN;
+ if (f2fs_has_inline_dentry(dir))
+ err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
+ if (err == -EAGAIN)
+ err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode);
+
fscrypt_free_filename(&fname);
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
@@ -645,42 +639,34 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
err = PTR_ERR(page);
goto fail;
}
- /* we don't need to mark_inode_dirty now */
- update_inode(inode, page);
f2fs_put_page(page, 1);
- clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ clear_inode_flag(inode, FI_NEW_INODE);
fail:
up_write(&F2FS_I(inode)->i_sem);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
-void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page)
+void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
down_write(&F2FS_I(inode)->i_sem);
- if (S_ISDIR(inode->i_mode)) {
- drop_nlink(dir);
- if (page)
- update_inode(dir, page);
- else
- update_inode_page(dir);
- }
+ if (S_ISDIR(inode->i_mode))
+ f2fs_i_links_write(dir, false);
inode->i_ctime = CURRENT_TIME;
- drop_nlink(inode);
+ f2fs_i_links_write(inode, false);
if (S_ISDIR(inode->i_mode)) {
- drop_nlink(inode);
- i_size_write(inode, 0);
+ f2fs_i_links_write(inode, false);
+ f2fs_i_size_write(inode, 0);
}
up_write(&F2FS_I(inode)->i_sem);
- update_inode_page(inode);
if (inode->i_nlink == 0)
- add_orphan_inode(sbi, inode->i_ino);
+ add_orphan_inode(inode);
else
release_orphan_inode(sbi);
}
@@ -718,9 +704,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ f2fs_mark_inode_dirty_sync(dir);
if (inode)
- f2fs_drop_nlink(dir, inode, NULL);
+ f2fs_drop_nlink(dir, inode);
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!truncate_hole(dir, page->index, page->index + 1)) {
@@ -792,10 +779,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
continue;
}
- if (de->file_type < F2FS_FT_MAX)
- d_type = f2fs_filetype_table[de->file_type];
- else
- d_type = DT_UNKNOWN;
+ d_type = get_de_type(de);
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
@@ -804,7 +788,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
int save_len = fstr->len;
int ret;
- de_name.name = kmalloc(de_name.len, GFP_NOFS);
+ de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS);
if (!de_name.name)
return false;
@@ -887,6 +871,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
+ err = 0;
out:
fscrypt_fname_free_buffer(&fstr);
return err;
@@ -902,7 +887,7 @@ static int f2fs_dir_open(struct inode *inode, struct file *filp)
const struct file_operations f2fs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = f2fs_readdir,
+ .iterate_shared = f2fs_readdir,
.fsync = f2fs_sync_file,
.open = f2fs_dir_open,
.unlocked_ioctl = f2fs_ioctl,
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index c859bb0447280..2b06d4fcd954e 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -170,8 +170,10 @@ static void __drop_largest_extent(struct inode *inode,
{
struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
- if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs)
+ if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) {
largest->len = 0;
+ f2fs_mark_inode_dirty_sync(inode);
+ }
}
/* return true, if inode page is changed */
@@ -196,8 +198,7 @@ bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
if (!i_ext || !i_ext->len)
return false;
- set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
- le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+ get_extent_info(&ei, i_ext);
write_lock(&et->lock);
if (atomic_read(&et->node_cnt))
@@ -336,11 +337,12 @@ lookup_neighbors:
return en;
}
-static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
+static struct extent_node *__try_merge_extent_node(struct inode *inode,
struct extent_tree *et, struct extent_info *ei,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_node *en = NULL;
if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
@@ -361,7 +363,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
if (!en)
return NULL;
- __try_update_largest_extent(et, en);
+ __try_update_largest_extent(inode, et, en);
spin_lock(&sbi->extent_lock);
if (!list_empty(&en->list)) {
@@ -372,11 +374,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
return en;
}
-static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+static struct extent_node *__insert_extent_tree(struct inode *inode,
struct extent_tree *et, struct extent_info *ei,
struct rb_node **insert_p,
struct rb_node *insert_parent)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct rb_node **p = &et->root.rb_node;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -403,7 +406,7 @@ do_insert:
if (!en)
return NULL;
- __try_update_largest_extent(et, en);
+ __try_update_largest_extent(inode, et, en);
/* update in global extent list */
spin_lock(&sbi->extent_lock);
@@ -432,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
write_lock(&et->lock);
- if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
write_unlock(&et->lock);
return false;
}
@@ -474,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
set_extent_info(&ei, end,
end - dei.fofs + dei.blk,
org_end - end);
- en1 = __insert_extent_tree(sbi, et, &ei,
+ en1 = __insert_extent_tree(inode, et, &ei,
NULL, NULL);
next_en = en1;
} else {
@@ -495,7 +498,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
}
if (parts)
- __try_update_largest_extent(et, en);
+ __try_update_largest_extent(inode, et, en);
else
__release_extent_node(sbi, et, en);
@@ -515,20 +518,20 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (blkaddr) {
set_extent_info(&ei, fofs, blkaddr, len);
- if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
- __insert_extent_tree(sbi, et, &ei,
+ if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en))
+ __insert_extent_tree(inode, et, &ei,
insert_p, insert_parent);
/* give up extent_cache, if split and small updates happen */
if (dei.len >= 1 &&
prev.len < F2FS_MIN_EXTENT_LEN &&
et->largest.len < F2FS_MIN_EXTENT_LEN) {
- et->largest.len = 0;
- set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
+ __drop_largest_extent(inode, 0, UINT_MAX);
+ set_inode_flag(inode, FI_NO_EXTENT);
}
}
- if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
__free_extent_tree(sbi, et);
write_unlock(&et->lock);
@@ -628,6 +631,19 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
return node_cnt;
}
+void f2fs_drop_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+
+ set_inode_flag(inode, FI_NO_EXTENT);
+
+ write_lock(&et->lock);
+ __free_extent_tree(sbi, et);
+ __drop_largest_extent(inode, 0, UINT_MAX);
+ write_unlock(&et->lock);
+}
+
void f2fs_destroy_extent_tree(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -686,9 +702,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn)
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
-
- if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1))
- sync_inode_page(dn);
+ f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1);
}
void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
@@ -698,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
if (!f2fs_may_extent_tree(dn->inode))
return;
- if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len))
- sync_inode_page(dn);
+ f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len);
}
void init_extent_cache_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index bbe2cd1265d0c..675fa79d86f65 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,6 +37,60 @@
} while (0)
#endif
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+enum {
+ FAULT_KMALLOC,
+ FAULT_PAGE_ALLOC,
+ FAULT_ALLOC_NID,
+ FAULT_ORPHAN,
+ FAULT_BLOCK,
+ FAULT_DIR_DEPTH,
+ FAULT_EVICT_INODE,
+ FAULT_MAX,
+};
+
+struct f2fs_fault_info {
+ atomic_t inject_ops;
+ unsigned int inject_rate;
+ unsigned int inject_type;
+};
+
+extern struct f2fs_fault_info f2fs_fault;
+extern char *fault_name[FAULT_MAX];
+#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type)))
+
+static inline bool time_to_inject(int type)
+{
+ if (!f2fs_fault.inject_rate)
+ return false;
+ if (type == FAULT_KMALLOC && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_BLOCK && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type))
+ return false;
+ else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type))
+ return false;
+
+ atomic_inc(&f2fs_fault.inject_ops);
+ if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) {
+ atomic_set(&f2fs_fault.inject_ops, 0);
+ printk("%sF2FS-fs : inject %s in %pF\n",
+ KERN_INFO,
+ fault_name[type],
+ __builtin_return_address(0));
+ return true;
+ }
+ return false;
+}
+#endif
+
/*
* For mount options
*/
@@ -56,6 +110,9 @@
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
+#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
+#define F2FS_MOUNT_ADAPTIVE 0x00020000
+#define F2FS_MOUNT_LFS 0x00040000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -76,6 +133,7 @@ struct f2fs_mount_info {
};
#define F2FS_FEATURE_ENCRYPT 0x0001
+#define F2FS_FEATURE_HMSMR 0x0002
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -106,7 +164,7 @@ enum {
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
#define DEF_CP_INTERVAL 60 /* 60 secs */
-#define DEF_IDLE_INTERVAL 120 /* 2 mins */
+#define DEF_IDLE_INTERVAL 5 /* 5 secs */
struct cp_control {
int reason;
@@ -159,7 +217,6 @@ struct fsync_inode_entry {
struct inode *inode; /* vfs inode pointer */
block_t blkaddr; /* block address locating the last fsync */
block_t last_dentry; /* block address locating the last dentry */
- block_t last_inode; /* block address locating the last inode */
};
#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
@@ -211,6 +268,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
+#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
+ struct f2fs_move_range)
#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -240,6 +299,13 @@ struct f2fs_defragment {
u64 len;
};
+struct f2fs_move_range {
+ u32 dst_fd; /* destination fd */
+ u64 pos_in; /* start position in src_fd */
+ u64 pos_out; /* start position in dst_fd */
+ u64 len; /* size to move */
+};
+
/*
* For INODE and NODE manager
*/
@@ -385,24 +451,27 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
- atomic_t dirty_pages; /* # of dirty pages */
+ struct percpu_counter dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
+ loff_t last_disk_size; /* lastly written file size */
- struct list_head dirty_list; /* linked in global dirty list */
+ struct list_head dirty_list; /* dirty list for dirs and files */
+ struct list_head gdirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
};
static inline void get_extent_info(struct extent_info *ext,
- struct f2fs_extent i_ext)
+ struct f2fs_extent *i_ext)
{
- ext->fofs = le32_to_cpu(i_ext.fofs);
- ext->blk = le32_to_cpu(i_ext.blk);
- ext->len = le32_to_cpu(i_ext.len);
+ ext->fofs = le32_to_cpu(i_ext->fofs);
+ ext->blk = le32_to_cpu(i_ext->blk);
+ ext->len = le32_to_cpu(i_ext->len);
}
static inline void set_raw_extent(struct extent_info *ext,
@@ -447,11 +516,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
-static inline void __try_update_largest_extent(struct extent_tree *et,
- struct extent_node *en)
+extern void f2fs_mark_inode_dirty_sync(struct inode *);
+static inline void __try_update_largest_extent(struct inode *inode,
+ struct extent_tree *et, struct extent_node *en)
{
- if (en->ei.len > et->largest.len)
+ if (en->ei.len > et->largest.len) {
et->largest = en->ei;
+ f2fs_mark_inode_dirty_sync(inode);
+ }
}
struct f2fs_nm_info {
@@ -466,7 +538,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
+ struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
@@ -548,6 +620,7 @@ struct flush_cmd {
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
+ atomic_t submit_flush; /* # of issued flushes */
struct llist_head issue_list; /* list for command issue */
struct llist_node *dispatch_list; /* list for command dispatch */
};
@@ -599,12 +672,12 @@ struct f2fs_sm_info {
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
enum count_type {
- F2FS_WRITEBACK,
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
+ F2FS_DIRTY_IMETA,
NR_COUNT_TYPE,
};
@@ -636,14 +709,15 @@ enum page_type {
struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
- int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+ int op; /* contains REQ_OP_ */
+ int op_flags; /* rq_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
struct page *encrypted_page; /* encrypted page */
};
-#define is_read_io(rw) (((rw) & 1) == READ)
+#define is_read_io(rw) (rw == READ)
struct f2fs_bio_info {
struct f2fs_sb_info *sbi; /* f2fs superblock */
struct bio *bio; /* bios to merge */
@@ -655,6 +729,7 @@ struct f2fs_bio_info {
enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
+ DIRTY_META, /* for all dirtied inode metadata */
NR_INODE_TYPE,
};
@@ -672,6 +747,7 @@ enum {
SBI_IS_CLOSE, /* specify unmounting */
SBI_NEED_FSCK, /* need fsck.f2fs to fix */
SBI_POR_DOING, /* recovery is doing or not */
+ SBI_NEED_SB_WRITE, /* need to recover superblock */
};
enum {
@@ -680,6 +756,10 @@ enum {
MAX_TIME,
};
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#define F2FS_KEY_DESC_PREFIX "f2fs:"
+#define F2FS_KEY_DESC_PREFIX_SIZE 5
+#endif
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
@@ -687,6 +767,10 @@ struct f2fs_sb_info {
int valid_super_block; /* valid super block no */
int s_flag; /* flags for sbi */
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
+ u8 key_prefix_size;
+#endif
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
struct inode *node_inode; /* cache node blocks */
@@ -697,14 +781,14 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info read_io; /* for read bios */
struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
+ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
- struct rw_semaphore cp_rwsem; /* blocking FS operations */
+ struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
- struct mutex writepages; /* mutex for writepages() */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -742,18 +826,24 @@ struct f2fs_sb_info {
unsigned int total_sections; /* total section count */
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
- unsigned int total_valid_inode_count; /* valid inode count */
loff_t max_file_blocks; /* max block index of file */
int active_logs; /* # of active logs */
int dir_level; /* directory level */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
- block_t alloc_valid_block_count; /* # of allocated blocks */
block_t discard_blks; /* discard command candidats */
block_t last_valid_block_count; /* for recovery */
u32 s_next_generation; /* for NFS support */
- atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */
+ atomic_t nr_wb_bios; /* # of writeback bios */
+
+ /* # of pages, see count_type */
+ struct percpu_counter nr_pages[NR_COUNT_TYPE];
+ /* # of allocated blocks */
+ struct percpu_counter alloc_valid_block_count;
+
+ /* valid inode count */
+ struct percpu_counter total_valid_inode_count;
struct f2fs_mount_info mount_opt; /* mount options */
@@ -984,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- down_read(&sbi->cp_rwsem);
+ percpu_down_read(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- up_read(&sbi->cp_rwsem);
+ percpu_up_read(&sbi->cp_rwsem);
}
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- down_write(&sbi->cp_rwsem);
+ percpu_down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->cp_rwsem);
+ percpu_up_write(&sbi->cp_rwsem);
}
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
@@ -1054,22 +1144,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs)
return ofs == XATTR_NODE_OFFSET;
}
+static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool);
static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
- struct inode *inode, blkcnt_t count)
+ struct inode *inode, blkcnt_t *count)
{
- block_t valid_block_count;
+ blkcnt_t diff;
- spin_lock(&sbi->stat_lock);
- valid_block_count =
- sbi->total_valid_block_count + (block_t)count;
- if (unlikely(valid_block_count > sbi->user_block_count)) {
- spin_unlock(&sbi->stat_lock);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_BLOCK))
return false;
+#endif
+ /*
+ * let's increase this in prior to actual block count change in order
+ * for f2fs_sync_file to avoid data races when deciding checkpoint.
+ */
+ percpu_counter_add(&sbi->alloc_valid_block_count, (*count));
+
+ spin_lock(&sbi->stat_lock);
+ sbi->total_valid_block_count += (block_t)(*count);
+ if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) {
+ diff = sbi->total_valid_block_count - sbi->user_block_count;
+ *count -= diff;
+ sbi->total_valid_block_count = sbi->user_block_count;
+ if (!*count) {
+ spin_unlock(&sbi->stat_lock);
+ percpu_counter_sub(&sbi->alloc_valid_block_count, diff);
+ return false;
+ }
}
- inode->i_blocks += count;
- sbi->total_valid_block_count = valid_block_count;
- sbi->alloc_valid_block_count += (block_t)count;
spin_unlock(&sbi->stat_lock);
+
+ f2fs_i_blocks_write(inode, *count, true);
return true;
}
@@ -1080,27 +1185,27 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
f2fs_bug_on(sbi, inode->i_blocks < count);
- inode->i_blocks -= count;
sbi->total_valid_block_count -= (block_t)count;
spin_unlock(&sbi->stat_lock);
+ f2fs_i_blocks_write(inode, count, false);
}
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- atomic_inc(&sbi->nr_pages[count_type]);
+ percpu_counter_inc(&sbi->nr_pages[count_type]);
set_sbi_flag(sbi, SBI_IS_DIRTY);
}
static inline void inode_inc_dirty_pages(struct inode *inode)
{
- atomic_inc(&F2FS_I(inode)->dirty_pages);
+ percpu_counter_inc(&F2FS_I(inode)->dirty_pages);
inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- atomic_dec(&sbi->nr_pages[count_type]);
+ percpu_counter_dec(&sbi->nr_pages[count_type]);
}
static inline void inode_dec_dirty_pages(struct inode *inode)
@@ -1109,26 +1214,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
!S_ISLNK(inode->i_mode))
return;
- atomic_dec(&F2FS_I(inode)->dirty_pages);
+ percpu_counter_dec(&F2FS_I(inode)->dirty_pages);
dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
-static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
+static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
- return atomic_read(&sbi->nr_pages[count_type]);
+ return percpu_counter_sum_positive(&sbi->nr_pages[count_type]);
}
-static inline int get_dirty_pages(struct inode *inode)
+static inline s64 get_dirty_pages(struct inode *inode)
{
- return atomic_read(&F2FS_I(inode)->dirty_pages);
+ return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages);
}
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
- return ((get_pages(sbi, block_type) + pages_per_sec - 1)
- >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+ unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >>
+ sbi->log_blocks_per_seg;
+
+ return segs / sbi->segs_per_sec;
}
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
@@ -1215,13 +1322,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
}
if (inode)
- inode->i_blocks++;
+ f2fs_i_blocks_write(inode, 1, true);
- sbi->alloc_valid_block_count++;
sbi->total_valid_node_count++;
sbi->total_valid_block_count++;
spin_unlock(&sbi->stat_lock);
+ percpu_counter_inc(&sbi->alloc_valid_block_count);
return true;
}
@@ -1234,7 +1341,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, !sbi->total_valid_node_count);
f2fs_bug_on(sbi, !inode->i_blocks);
- inode->i_blocks--;
+ f2fs_i_blocks_write(inode, 1, false);
sbi->total_valid_node_count--;
sbi->total_valid_block_count--;
@@ -1248,28 +1355,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
- sbi->total_valid_inode_count++;
- spin_unlock(&sbi->stat_lock);
+ percpu_counter_inc(&sbi->total_valid_inode_count);
}
static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
- sbi->total_valid_inode_count--;
- spin_unlock(&sbi->stat_lock);
+ percpu_counter_dec(&sbi->total_valid_inode_count);
}
-static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+static inline s64 valid_inode_count(struct f2fs_sb_info *sbi)
{
- return sbi->total_valid_inode_count;
+ return percpu_counter_sum_positive(&sbi->total_valid_inode_count);
}
static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
pgoff_t index, bool for_write)
{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct page *page = find_lock_page(mapping, index);
+ if (page)
+ return page;
+
+ if (time_to_inject(FAULT_PAGE_ALLOC))
+ return NULL;
+#endif
if (!for_write)
return grab_cache_page(mapping, index);
return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
@@ -1294,7 +1403,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
unlock_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
static inline void f2fs_put_dnode(struct dnode_of_data *dn)
@@ -1429,13 +1538,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
FI_DIRTY_INODE, /* indicate inode is dirty or not */
+ FI_AUTO_RECOVER, /* indicate inode is recoverable */
FI_DIRTY_DIR, /* indicate directory has dirty pages */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
FI_FREE_NID, /* free allocated nide */
- FI_UPDATE_DIR, /* should update inode block for consistency */
- FI_DELAY_IPUT, /* used for the recovery */
FI_NO_EXTENT, /* not to use the extent cache */
FI_INLINE_XATTR, /* used for inline xattr */
FI_INLINE_DATA, /* used for inline data*/
@@ -1453,64 +1561,143 @@ enum {
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
};
-static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
+static inline void __mark_inode_dirty_flag(struct inode *inode,
+ int flag, bool set)
+{
+ switch (flag) {
+ case FI_INLINE_XATTR:
+ case FI_INLINE_DATA:
+ case FI_INLINE_DENTRY:
+ if (set)
+ return;
+ case FI_DATA_EXIST:
+ case FI_INLINE_DOTS:
+ f2fs_mark_inode_dirty_sync(inode);
+ }
+}
+
+static inline void set_inode_flag(struct inode *inode, int flag)
+{
+ if (!test_bit(flag, &F2FS_I(inode)->flags))
+ set_bit(flag, &F2FS_I(inode)->flags);
+ __mark_inode_dirty_flag(inode, flag, true);
+}
+
+static inline int is_inode_flag_set(struct inode *inode, int flag)
+{
+ return test_bit(flag, &F2FS_I(inode)->flags);
+}
+
+static inline void clear_inode_flag(struct inode *inode, int flag)
+{
+ if (test_bit(flag, &F2FS_I(inode)->flags))
+ clear_bit(flag, &F2FS_I(inode)->flags);
+ __mark_inode_dirty_flag(inode, flag, false);
+}
+
+static inline void set_acl_inode(struct inode *inode, umode_t mode)
+{
+ F2FS_I(inode)->i_acl_mode = mode;
+ set_inode_flag(inode, FI_ACL_MODE);
+ f2fs_mark_inode_dirty_sync(inode);
+}
+
+static inline void f2fs_i_links_write(struct inode *inode, bool inc)
+{
+ if (inc)
+ inc_nlink(inode);
+ else
+ drop_nlink(inode);
+ f2fs_mark_inode_dirty_sync(inode);
+}
+
+static inline void f2fs_i_blocks_write(struct inode *inode,
+ blkcnt_t diff, bool add)
+{
+ bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
+ bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER);
+
+ inode->i_blocks = add ? inode->i_blocks + diff :
+ inode->i_blocks - diff;
+ f2fs_mark_inode_dirty_sync(inode);
+ if (clean || recover)
+ set_inode_flag(inode, FI_AUTO_RECOVER);
+}
+
+static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
{
- if (!test_bit(flag, &fi->flags))
- set_bit(flag, &fi->flags);
+ bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
+ bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER);
+
+ if (i_size_read(inode) == i_size)
+ return;
+
+ i_size_write(inode, i_size);
+ f2fs_mark_inode_dirty_sync(inode);
+ if (clean || recover)
+ set_inode_flag(inode, FI_AUTO_RECOVER);
+}
+
+static inline bool f2fs_skip_inode_update(struct inode *inode)
+{
+ if (!is_inode_flag_set(inode, FI_AUTO_RECOVER))
+ return false;
+ return F2FS_I(inode)->last_disk_size == i_size_read(inode);
}
-static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
+static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
{
- return test_bit(flag, &fi->flags);
+ F2FS_I(inode)->i_current_depth = depth;
+ f2fs_mark_inode_dirty_sync(inode);
}
-static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid)
{
- if (test_bit(flag, &fi->flags))
- clear_bit(flag, &fi->flags);
+ F2FS_I(inode)->i_xattr_nid = xnid;
+ f2fs_mark_inode_dirty_sync(inode);
}
-static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
+static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino)
{
- fi->i_acl_mode = mode;
- set_inode_flag(fi, FI_ACL_MODE);
+ F2FS_I(inode)->i_pino = pino;
+ f2fs_mark_inode_dirty_sync(inode);
}
-static inline void get_inline_info(struct f2fs_inode_info *fi,
- struct f2fs_inode *ri)
+static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
if (ri->i_inline & F2FS_INLINE_XATTR)
- set_inode_flag(fi, FI_INLINE_XATTR);
+ set_bit(FI_INLINE_XATTR, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DATA)
- set_inode_flag(fi, FI_INLINE_DATA);
+ set_bit(FI_INLINE_DATA, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DENTRY)
- set_inode_flag(fi, FI_INLINE_DENTRY);
+ set_bit(FI_INLINE_DENTRY, &fi->flags);
if (ri->i_inline & F2FS_DATA_EXIST)
- set_inode_flag(fi, FI_DATA_EXIST);
+ set_bit(FI_DATA_EXIST, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DOTS)
- set_inode_flag(fi, FI_INLINE_DOTS);
+ set_bit(FI_INLINE_DOTS, &fi->flags);
}
-static inline void set_raw_inline(struct f2fs_inode_info *fi,
- struct f2fs_inode *ri)
+static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
{
ri->i_inline = 0;
- if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+ if (is_inode_flag_set(inode, FI_INLINE_XATTR))
ri->i_inline |= F2FS_INLINE_XATTR;
- if (is_inode_flag_set(fi, FI_INLINE_DATA))
+ if (is_inode_flag_set(inode, FI_INLINE_DATA))
ri->i_inline |= F2FS_INLINE_DATA;
- if (is_inode_flag_set(fi, FI_INLINE_DENTRY))
+ if (is_inode_flag_set(inode, FI_INLINE_DENTRY))
ri->i_inline |= F2FS_INLINE_DENTRY;
- if (is_inode_flag_set(fi, FI_DATA_EXIST))
+ if (is_inode_flag_set(inode, FI_DATA_EXIST))
ri->i_inline |= F2FS_DATA_EXIST;
- if (is_inode_flag_set(fi, FI_INLINE_DOTS))
+ if (is_inode_flag_set(inode, FI_INLINE_DOTS))
ri->i_inline |= F2FS_INLINE_DOTS;
}
static inline int f2fs_has_inline_xattr(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
+ return is_inode_flag_set(inode, FI_INLINE_XATTR);
}
static inline unsigned int addrs_per_inode(struct inode *inode)
@@ -1537,43 +1724,43 @@ static inline int inline_xattr_size(struct inode *inode)
static inline int f2fs_has_inline_data(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
+ return is_inode_flag_set(inode, FI_INLINE_DATA);
}
static inline void f2fs_clear_inline_inode(struct inode *inode)
{
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ clear_inode_flag(inode, FI_INLINE_DATA);
+ clear_inode_flag(inode, FI_DATA_EXIST);
}
static inline int f2fs_exist_data(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST);
+ return is_inode_flag_set(inode, FI_DATA_EXIST);
}
static inline int f2fs_has_inline_dots(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS);
+ return is_inode_flag_set(inode, FI_INLINE_DOTS);
}
static inline bool f2fs_is_atomic_file(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+ return is_inode_flag_set(inode, FI_ATOMIC_FILE);
}
static inline bool f2fs_is_volatile_file(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+ return is_inode_flag_set(inode, FI_VOLATILE_FILE);
}
static inline bool f2fs_is_first_block_written(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN);
}
static inline bool f2fs_is_drop_cache(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+ return is_inode_flag_set(inode, FI_DROP_CACHE);
}
static inline void *inline_data_addr(struct page *page)
@@ -1584,7 +1771,7 @@ static inline void *inline_data_addr(struct page *page)
static inline int f2fs_has_inline_dentry(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY);
+ return is_inode_flag_set(inode, FI_INLINE_DENTRY);
}
static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
@@ -1601,11 +1788,13 @@ static inline int is_file(struct inode *inode, int type)
static inline void set_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise |= type;
+ f2fs_mark_inode_dirty_sync(inode);
}
static inline void clear_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise &= ~type;
+ f2fs_mark_inode_dirty_sync(inode);
}
static inline int f2fs_readonly(struct super_block *sb)
@@ -1618,12 +1807,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
}
-static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
-{
- set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
- sbi->sb->s_flags |= MS_RDONLY;
-}
-
static inline bool is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -1638,12 +1821,21 @@ static inline bool is_dot_dotdot(const struct qstr *str)
static inline bool f2fs_may_extent_tree(struct inode *inode)
{
if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
- is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+ is_inode_flag_set(inode, FI_NO_EXTENT))
return false;
return S_ISREG(inode->i_mode);
}
+static inline void *f2fs_kmalloc(size_t size, gfp_t flags)
+{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_KMALLOC))
+ return NULL;
+#endif
+ return kmalloc(size, flags);
+}
+
static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
{
void *ret;
@@ -1665,7 +1857,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
}
#define get_inode_mode(i) \
- ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
/* get offset of first page in next direct node */
@@ -1680,7 +1872,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
int truncate_blocks(struct inode *, u64, bool);
-int f2fs_truncate(struct inode *, bool);
+int f2fs_truncate(struct inode *);
int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
@@ -1710,7 +1902,7 @@ struct dentry *f2fs_get_parent(struct dentry *child);
*/
extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
void set_de_type(struct f2fs_dir_entry *, umode_t);
-
+unsigned char get_de_type(struct f2fs_dir_entry *);
struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
@@ -1721,16 +1913,18 @@ struct page *init_inode_metadata(struct inode *, struct inode *,
const struct qstr *, struct page *);
void update_parent_metadata(struct inode *, struct inode *, unsigned int);
int room_for_filename(const void *, int, int);
-void f2fs_drop_nlink(struct inode *, struct inode *, struct page *);
-struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
+void f2fs_drop_nlink(struct inode *, struct inode *);
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *,
struct page **);
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
-ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
+ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
const struct qstr *, f2fs_hash_t , unsigned int);
+int f2fs_add_regular_entry(struct inode *, const struct qstr *,
+ struct inode *, nid_t, umode_t);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
umode_t);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
@@ -1747,6 +1941,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
/*
* super.c
*/
+int f2fs_inode_dirtied(struct inode *);
+void f2fs_inode_synced(struct inode *);
int f2fs_commit_super(struct f2fs_sb_info *, bool);
int f2fs_sync_fs(struct super_block *, int);
extern __printf(3, 4)
@@ -1780,8 +1976,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_node_page_ra(struct page *, int);
-void sync_inode_page(struct dnode_of_data *);
-int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
+void move_node_page(struct page *, int);
+int fsync_node_pages(struct f2fs_sb_info *, struct inode *,
+ struct writeback_control *, bool);
+int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *);
+void build_free_nids(struct f2fs_sb_info *);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
@@ -1843,6 +2042,7 @@ void destroy_segment_manager_caches(void);
/*
* checkpoint.c
*/
+void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool);
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
@@ -1852,16 +2052,16 @@ void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
-void release_ino_entry(struct f2fs_sb_info *);
+void release_ino_entry(struct f2fs_sb_info *, bool);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
+int f2fs_sync_inode_meta(struct f2fs_sb_info *);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
-void add_orphan_inode(struct f2fs_sb_info *, nid_t);
+void add_orphan_inode(struct inode *);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void update_dirty_page(struct inode *, struct page *);
-void add_dirty_dir_inode(struct inode *);
void remove_dirty_inode(struct inode *);
int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type);
int write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
@@ -1880,6 +2080,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *);
void f2fs_submit_page_mbio(struct f2fs_io_info *);
void set_data_blkaddr(struct dnode_of_data *);
void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
+int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
@@ -1891,6 +2092,7 @@ struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int do_write_data_page(struct f2fs_io_info *);
int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void f2fs_set_page_dirty_nobuffers(struct page *);
void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
int f2fs_release_page(struct page *, gfp_t);
@@ -1906,7 +2108,7 @@ void build_gc_manager(struct f2fs_sb_info *);
/*
* recovery.c
*/
-int recover_fsync_data(struct f2fs_sb_info *);
+int recover_fsync_data(struct f2fs_sb_info *, bool);
bool space_for_roll_forward(struct f2fs_sb_info *);
/*
@@ -1921,12 +2123,12 @@ struct f2fs_stat_info {
unsigned long long hit_largest, hit_cached, hit_rbtree;
unsigned long long hit_total, total_ext;
int ext_tree, zombie_tree, ext_node;
- int ndirty_node, ndirty_meta;
- int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files;
+ s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages;
+ unsigned int ndirty_dirs, ndirty_files, ndirty_all;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
- int bg_gc, inmem_pages, wb_pages;
- int inline_xattr, inline_inode, inline_dir;
+ int bg_gc, wb_bios;
+ int inline_xattr, inline_inode, inline_dir, orphans;
unsigned int valid_count, valid_node_count, valid_inode_count;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -2091,7 +2293,6 @@ int f2fs_write_inline_data(struct inode *, struct page *);
bool recover_inline_data(struct inode *, struct page *);
struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
struct fscrypt_name *, struct page **);
-struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
nid_t, umode_t);
@@ -2116,6 +2317,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *);
*/
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+void f2fs_drop_extent_tree(struct inode *);
unsigned int f2fs_destroy_extent_node(struct inode *);
void f2fs_destroy_extent_tree(struct inode *);
bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
@@ -2151,6 +2353,26 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb)
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
}
+static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR);
+}
+
+static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
+{
+ clear_opt(sbi, ADAPTIVE);
+ clear_opt(sbi, LFS);
+
+ switch (mt) {
+ case F2FS_MOUNT_ADAPTIVE:
+ set_opt(sbi, ADAPTIVE);
+ break;
+ case F2FS_MOUNT_LFS:
+ set_opt(sbi, LFS);
+ break;
+ }
+}
+
static inline bool f2fs_may_encrypt(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b41c3579ea9e8..0e493f63ea414 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -20,7 +20,8 @@
#include <linux/uaccess.h>
#include <linux/mount.h>
#include <linux/pagevec.h>
-#include <linux/random.h>
+#include <linux/uuid.h>
+#include <linux/file.h>
#include "f2fs.h"
#include "node.h"
@@ -74,14 +75,15 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
goto mapped;
/* page is wholly or partially inside EOF */
- if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+ if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
i_size_read(inode)) {
unsigned offset;
- offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ offset = i_size_read(inode) & ~PAGE_MASK;
+ zero_user_segment(page, offset, PAGE_SIZE);
}
set_page_dirty(page);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
@@ -171,21 +173,16 @@ static void try_to_fix_pino(struct inode *inode)
fi->xattr_ver = 0;
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
- fi->i_pino = pino;
+ f2fs_i_pino_write(inode, pino);
file_got_pino(inode);
- up_write(&fi->i_sem);
-
- mark_inode_dirty_sync(inode);
- f2fs_write_inode(inode, NULL);
- } else {
- up_write(&fi->i_sem);
}
+ up_write(&fi->i_sem);
}
-int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
+ int datasync, bool atomic)
{
struct inode *inode = file->f_mapping->host;
- struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t ino = inode->i_ino;
int ret = 0;
@@ -203,9 +200,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
/* if fdatasync is triggered, let's do in-place-update */
if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
- set_inode_flag(fi, FI_NEED_IPU);
+ set_inode_flag(inode, FI_NEED_IPU);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- clear_inode_flag(fi, FI_NEED_IPU);
+ clear_inode_flag(inode, FI_NEED_IPU);
if (ret) {
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -213,7 +210,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
/* if the inode is dirty, let's recover all the time */
- if (!datasync) {
+ if (!datasync && !f2fs_skip_inode_update(inode)) {
f2fs_write_inode(inode, NULL);
goto go_write;
}
@@ -221,14 +218,14 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
/*
* if there is no written data, don't waste time to write recovery info.
*/
- if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
+ if (!is_inode_flag_set(inode, FI_APPEND_WRITE) &&
!exist_written_data(sbi, ino, APPEND_INO)) {
/* it may call write_inode just prior to fsync */
if (need_inode_page_update(sbi, ino))
goto go_write;
- if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE) ||
exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
@@ -238,9 +235,9 @@ go_write:
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- down_read(&fi->i_sem);
+ down_read(&F2FS_I(inode)->i_sem);
need_cp = need_do_checkpoint(inode);
- up_read(&fi->i_sem);
+ up_read(&F2FS_I(inode)->i_sem);
if (need_cp) {
/* all the dirty node pages should be flushed for POR */
@@ -251,12 +248,14 @@ go_write:
* will be used only for fsynced inodes after checkpoint.
*/
try_to_fix_pino(inode);
- clear_inode_flag(fi, FI_APPEND_WRITE);
- clear_inode_flag(fi, FI_UPDATE_WRITE);
+ clear_inode_flag(inode, FI_APPEND_WRITE);
+ clear_inode_flag(inode, FI_UPDATE_WRITE);
goto out;
}
sync_nodes:
- sync_node_pages(sbi, ino, &wbc);
+ ret = fsync_node_pages(sbi, inode, &wbc, atomic);
+ if (ret)
+ goto out;
/* if cp_error was enabled, we should avoid infinite loop */
if (unlikely(f2fs_cp_error(sbi))) {
@@ -265,7 +264,7 @@ sync_nodes:
}
if (need_inode_block_update(sbi, ino)) {
- mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode);
f2fs_write_inode(inode, NULL);
goto sync_nodes;
}
@@ -276,10 +275,10 @@ sync_nodes:
/* once recovery info is written, don't need to tack this */
remove_ino_entry(sbi, ino, APPEND_INO);
- clear_inode_flag(fi, FI_APPEND_WRITE);
+ clear_inode_flag(inode, FI_APPEND_WRITE);
flush_out:
remove_ino_entry(sbi, ino, UPDATE_INO);
- clear_inode_flag(fi, FI_UPDATE_WRITE);
+ clear_inode_flag(inode, FI_UPDATE_WRITE);
ret = f2fs_issue_flush(sbi);
f2fs_update_time(sbi, REQ_TIME);
out:
@@ -288,6 +287,11 @@ out:
return ret;
}
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ return f2fs_do_sync_file(file, start, end, datasync, false);
+}
+
static pgoff_t __get_first_dirty_index(struct address_space *mapping,
pgoff_t pgofs, int whence)
{
@@ -346,13 +350,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto found;
}
- pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
+ pgofs = (pgoff_t)(offset >> PAGE_SHIFT);
dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
- for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
+ for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+ err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE);
if (err && err != -ENOENT) {
goto fail;
} else if (err == -ENOENT) {
@@ -370,7 +374,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
/* find data/hole in dnode block */
for (; dn.ofs_in_node < end_offset;
dn.ofs_in_node++, pgofs++,
- data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
+ data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
@@ -441,7 +445,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
static int f2fs_file_open(struct inode *inode, struct file *filp)
{
int ret = generic_file_open(inode, filp);
- struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
+ struct dentry *dir;
if (!ret && f2fs_encrypted_inode(inode)) {
ret = fscrypt_get_encryption_info(inode);
@@ -450,9 +454,13 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (!fscrypt_has_encryption_key(inode))
return -ENOKEY;
}
- if (f2fs_encrypted_inode(dir) &&
- !fscrypt_has_permitted_context(dir, inode))
+ dir = dget_parent(file_dentry(filp));
+ if (f2fs_encrypted_inode(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+ dput(dir);
return -EPERM;
+ }
+ dput(dir);
return ret;
}
@@ -475,8 +483,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
set_data_blkaddr(dn);
invalidate_blocks(sbi, blkaddr);
if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
- clear_inode_flag(F2FS_I(dn->inode),
- FI_FIRST_BLOCK_WRITTEN);
+ clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
nr_free++;
}
@@ -490,7 +497,6 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
dn->inode) + ofs;
f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
- sync_inode_page(dn);
}
dn->ofs_in_node = ofs;
@@ -508,8 +514,8 @@ void truncate_data_blocks(struct dnode_of_data *dn)
static int truncate_partial_data_page(struct inode *inode, u64 from,
bool cache_only)
{
- unsigned offset = from & (PAGE_CACHE_SIZE - 1);
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE - 1);
+ pgoff_t index = from >> PAGE_SHIFT;
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -529,7 +535,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
truncate_out:
f2fs_wait_on_page_writeback(page, DATA, true);
- zero_user(page, offset, PAGE_CACHE_SIZE - offset);
+ zero_user(page, offset, PAGE_SIZE - offset);
if (!cache_only || !f2fs_encrypted_inode(inode) ||
!S_ISREG(inode->i_mode))
set_page_dirty(page);
@@ -551,6 +557,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
+ if (free_from >= sbi->max_file_blocks)
+ goto free_partial;
+
if (lock)
f2fs_lock_op(sbi);
@@ -569,7 +578,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
}
set_new_dnode(&dn, inode, ipage, NULL, 0);
- err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
+ err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA);
if (err) {
if (err == -ENOENT)
goto free_next;
@@ -592,7 +601,7 @@ free_next:
out:
if (lock)
f2fs_unlock_op(sbi);
-
+free_partial:
/* lastly zero out the first data page */
if (!err)
err = truncate_partial_data_page(inode, from, truncate_page);
@@ -601,7 +610,7 @@ out:
return err;
}
-int f2fs_truncate(struct inode *inode, bool lock)
+int f2fs_truncate(struct inode *inode)
{
int err;
@@ -618,12 +627,12 @@ int f2fs_truncate(struct inode *inode, bool lock)
return err;
}
- err = truncate_blocks(inode, i_size_read(inode), lock);
+ err = truncate_blocks(inode, i_size_read(inode), true);
if (err)
return err;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
return 0;
}
@@ -639,7 +648,6 @@ int f2fs_getattr(struct vfsmount *mnt,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
static void __setattr_copy(struct inode *inode, const struct iattr *attr)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int ia_valid = attr->ia_valid;
if (ia_valid & ATTR_UID)
@@ -660,7 +668,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
mode &= ~S_ISGID;
- set_acl_inode(fi, mode);
+ set_acl_inode(inode, mode);
}
}
#else
@@ -670,7 +678,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
- struct f2fs_inode_info *fi = F2FS_I(inode);
int err;
err = inode_change_ok(inode, attr);
@@ -684,7 +691,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size <= i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
- err = f2fs_truncate(inode, true);
+ err = f2fs_truncate(inode);
if (err)
return err;
f2fs_balance_fs(F2FS_I_SB(inode), true);
@@ -709,13 +716,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_MODE) {
err = posix_acl_chmod(inode, get_inode_mode(inode));
- if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
- inode->i_mode = fi->i_acl_mode;
- clear_inode_flag(fi, FI_ACL_MODE);
+ if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ clear_inode_flag(inode, FI_ACL_MODE);
}
}
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
return err;
}
@@ -799,11 +806,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ off_start = offset & (PAGE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_SIZE - 1);
if (pg_start == pg_end) {
ret = fill_zero(inode, pg_start, off_start,
@@ -813,7 +820,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
} else {
if (off_start) {
ret = fill_zero(inode, pg_start++, off_start,
- PAGE_CACHE_SIZE - off_start);
+ PAGE_SIZE - off_start);
if (ret)
return ret;
}
@@ -830,8 +837,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
- blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
- blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
+ blk_start = (loff_t)pg_start << PAGE_SHIFT;
+ blk_end = (loff_t)pg_end << PAGE_SHIFT;
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
@@ -844,79 +851,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
-static int __exchange_data_block(struct inode *inode, pgoff_t src,
- pgoff_t dst, bool full)
+static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr,
+ int *do_replace, pgoff_t off, pgoff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- block_t new_addr;
- bool do_replace = false;
- int ret;
+ int ret, done, i;
+next_dnode:
set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA);
+ ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
if (ret && ret != -ENOENT) {
return ret;
} else if (ret == -ENOENT) {
- new_addr = NULL_ADDR;
- } else {
- new_addr = dn.data_blkaddr;
- if (!is_checkpointed_data(sbi, new_addr)) {
+ if (dn.max_level == 0)
+ return -ENOENT;
+ done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len);
+ blkaddr += done;
+ do_replace += done;
+ goto next;
+ }
+
+ done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
+ dn.ofs_in_node, len);
+ for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
+ *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ if (!is_checkpointed_data(sbi, *blkaddr)) {
+
+ if (test_opt(sbi, LFS)) {
+ f2fs_put_dnode(&dn);
+ return -ENOTSUPP;
+ }
+
/* do not invalidate this block address */
f2fs_update_data_blkaddr(&dn, NULL_ADDR);
- do_replace = true;
+ *do_replace = 1;
}
- f2fs_put_dnode(&dn);
}
+ f2fs_put_dnode(&dn);
+next:
+ len -= done;
+ off += done;
+ if (len)
+ goto next_dnode;
+ return 0;
+}
- if (new_addr == NULL_ADDR)
- return full ? truncate_hole(inode, dst, dst + 1) : 0;
+static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr,
+ int *do_replace, pgoff_t off, int len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ int ret, i;
- if (do_replace) {
- struct page *ipage = get_node_page(sbi, inode->i_ino);
- struct node_info ni;
+ for (i = 0; i < len; i++, do_replace++, blkaddr++) {
+ if (*do_replace == 0)
+ continue;
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto err_out;
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA);
+ if (ret) {
+ dec_valid_block_count(sbi, inode, 1);
+ invalidate_blocks(sbi, *blkaddr);
+ } else {
+ f2fs_update_data_blkaddr(&dn, *blkaddr);
+ }
+ f2fs_put_dnode(&dn);
+ }
+ return 0;
+}
+
+static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
+ block_t *blkaddr, int *do_replace,
+ pgoff_t src, pgoff_t dst, pgoff_t len, bool full)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode);
+ pgoff_t i = 0;
+ int ret;
+
+ while (i < len) {
+ if (blkaddr[i] == NULL_ADDR && !full) {
+ i++;
+ continue;
}
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, dst);
- if (ret)
- goto err_out;
+ if (do_replace[i] || blkaddr[i] == NULL_ADDR) {
+ struct dnode_of_data dn;
+ struct node_info ni;
+ size_t new_size;
+ pgoff_t ilen;
- truncate_data_blocks_range(&dn, 1);
+ set_new_dnode(&dn, dst_inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE);
+ if (ret)
+ return ret;
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
- ni.version, true, false);
- f2fs_put_dnode(&dn);
- } else {
- struct page *psrc, *pdst;
+ get_node_info(sbi, dn.nid, &ni);
+ ilen = min((pgoff_t)
+ ADDRS_PER_PAGE(dn.node_page, dst_inode) -
+ dn.ofs_in_node, len - i);
+ do {
+ dn.data_blkaddr = datablock_addr(dn.node_page,
+ dn.ofs_in_node);
+ truncate_data_blocks_range(&dn, 1);
+
+ if (do_replace[i]) {
+ f2fs_i_blocks_write(src_inode,
+ 1, false);
+ f2fs_i_blocks_write(dst_inode,
+ 1, true);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+ blkaddr[i], ni.version, true, false);
+
+ do_replace[i] = 0;
+ }
+ dn.ofs_in_node++;
+ i++;
+ new_size = (dst + i) << PAGE_SHIFT;
+ if (dst_inode->i_size < new_size)
+ f2fs_i_size_write(dst_inode, new_size);
+ } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen);
- psrc = get_lock_data_page(inode, src, true);
- if (IS_ERR(psrc))
- return PTR_ERR(psrc);
- pdst = get_new_data_page(inode, NULL, dst, true);
- if (IS_ERR(pdst)) {
+ f2fs_put_dnode(&dn);
+ } else {
+ struct page *psrc, *pdst;
+
+ psrc = get_lock_data_page(src_inode, src + i, true);
+ if (IS_ERR(psrc))
+ return PTR_ERR(psrc);
+ pdst = get_new_data_page(dst_inode, NULL, dst + i,
+ true);
+ if (IS_ERR(pdst)) {
+ f2fs_put_page(psrc, 1);
+ return PTR_ERR(pdst);
+ }
+ f2fs_copy_page(psrc, pdst);
+ set_page_dirty(pdst);
+ f2fs_put_page(pdst, 1);
f2fs_put_page(psrc, 1);
- return PTR_ERR(pdst);
- }
- f2fs_copy_page(psrc, pdst);
- set_page_dirty(pdst);
- f2fs_put_page(pdst, 1);
- f2fs_put_page(psrc, 1);
- return truncate_hole(inode, src, src + 1);
+ ret = truncate_hole(src_inode, src + i, src + i + 1);
+ if (ret)
+ return ret;
+ i++;
+ }
}
return 0;
+}
-err_out:
- if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
- f2fs_update_data_blkaddr(&dn, new_addr);
- f2fs_put_dnode(&dn);
+static int __exchange_data_block(struct inode *src_inode,
+ struct inode *dst_inode, pgoff_t src, pgoff_t dst,
+ pgoff_t len, bool full)
+{
+ block_t *src_blkaddr;
+ int *do_replace;
+ pgoff_t olen;
+ int ret;
+
+ while (len) {
+ olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
+
+ src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
+ if (!src_blkaddr)
+ return -ENOMEM;
+
+ do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL);
+ if (!do_replace) {
+ kvfree(src_blkaddr);
+ return -ENOMEM;
+ }
+
+ ret = __read_out_blkaddrs(src_inode, src_blkaddr,
+ do_replace, src, olen);
+ if (ret)
+ goto roll_back;
+
+ ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr,
+ do_replace, src, dst, olen, full);
+ if (ret)
+ goto roll_back;
+
+ src += olen;
+ dst += olen;
+ len -= olen;
+
+ kvfree(src_blkaddr);
+ kvfree(do_replace);
}
+ return 0;
+
+roll_back:
+ __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len);
+ kvfree(src_blkaddr);
+ kvfree(do_replace);
return ret;
}
@@ -924,16 +1051,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
- int ret = 0;
+ int ret;
- for (; end < nrpages; start++, end++) {
- f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
- ret = __exchange_data_block(inode, end, start, true);
- f2fs_unlock_op(sbi);
- if (ret)
- break;
- }
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+
+ f2fs_drop_extent_tree(inode);
+
+ ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
+ f2fs_unlock_op(sbi);
return ret;
}
@@ -954,8 +1080,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- pg_start = offset >> PAGE_CACHE_SHIFT;
- pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = offset >> PAGE_SHIFT;
+ pg_end = (offset + len) >> PAGE_SHIFT;
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
@@ -977,7 +1103,50 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ret = truncate_blocks(inode, new_size, true);
if (!ret)
- i_size_write(inode, new_size);
+ f2fs_i_size_write(inode, new_size);
+
+ return ret;
+}
+
+static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
+ pgoff_t end)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ pgoff_t index = start;
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ blkcnt_t count = 0;
+ int ret;
+
+ for (; index < end; index++, dn->ofs_in_node++) {
+ if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR)
+ count++;
+ }
+
+ dn->ofs_in_node = ofs_in_node;
+ ret = reserve_new_blocks(dn, count);
+ if (ret)
+ return ret;
+
+ dn->ofs_in_node = ofs_in_node;
+ for (index = start; index < end; index++, dn->ofs_in_node++) {
+ dn->data_blkaddr =
+ datablock_addr(dn->node_page, dn->ofs_in_node);
+ /*
+ * reserve_new_blocks will not guarantee entire block
+ * allocation.
+ */
+ if (dn->data_blkaddr == NULL_ADDR) {
+ ret = -ENOSPC;
+ break;
+ }
+ if (dn->data_blkaddr != NEW_ADDR) {
+ invalidate_blocks(sbi, dn->data_blkaddr);
+ dn->data_blkaddr = NEW_ADDR;
+ set_data_blkaddr(dn);
+ }
+ }
+
+ f2fs_update_extent_cache_range(dn, start, 0, index - start);
return ret;
}
@@ -1006,11 +1175,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
truncate_pagecache_range(inode, offset, offset + len - 1);
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ off_start = offset & (PAGE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_SIZE - 1);
if (pg_start == pg_end) {
ret = fill_zero(inode, pg_start, off_start,
@@ -1024,43 +1193,40 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
} else {
if (off_start) {
ret = fill_zero(inode, pg_start++, off_start,
- PAGE_CACHE_SIZE - off_start);
+ PAGE_SIZE - off_start);
if (ret)
return ret;
new_size = max_t(loff_t, new_size,
- (loff_t)pg_start << PAGE_CACHE_SHIFT);
+ (loff_t)pg_start << PAGE_SHIFT);
}
- for (index = pg_start; index < pg_end; index++) {
+ for (index = pg_start; index < pg_end;) {
struct dnode_of_data dn;
- struct page *ipage;
+ unsigned int end_offset;
+ pgoff_t end;
f2fs_lock_op(sbi);
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- f2fs_unlock_op(sbi);
- goto out;
- }
-
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, index);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
f2fs_unlock_op(sbi);
goto out;
}
- if (dn.data_blkaddr != NEW_ADDR) {
- invalidate_blocks(sbi, dn.data_blkaddr);
- f2fs_update_data_blkaddr(&dn, NEW_ADDR);
- }
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end = min(pg_end, end_offset - dn.ofs_in_node + index);
+
+ ret = f2fs_do_zero_range(&dn, index, end);
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+ if (ret)
+ goto out;
+ index = end;
new_size = max_t(loff_t, new_size,
- (loff_t)(index + 1) << PAGE_CACHE_SHIFT);
+ (loff_t)index << PAGE_SHIFT);
}
if (off_end) {
@@ -1073,11 +1239,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
}
out:
- if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) {
- i_size_write(inode, new_size);
- mark_inode_dirty(inode);
- update_inode_page(inode);
- }
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+ f2fs_i_size_write(inode, new_size);
return ret;
}
@@ -1085,7 +1248,7 @@ out:
static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- pgoff_t pg_start, pg_end, delta, nrpages, idx;
+ pgoff_t nr, pg_start, pg_end, delta, idx;
loff_t new_size;
int ret = 0;
@@ -1117,17 +1280,23 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache(inode, offset);
- pg_start = offset >> PAGE_CACHE_SHIFT;
- pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = offset >> PAGE_SHIFT;
+ pg_end = (offset + len) >> PAGE_SHIFT;
delta = pg_end - pg_start;
- nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ while (!ret && idx > pg_start) {
+ nr = idx - pg_start;
+ if (nr > delta)
+ nr = delta;
+ idx -= nr;
- for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
f2fs_lock_op(sbi);
- ret = __exchange_data_block(inode, idx, idx + delta, false);
+ f2fs_drop_extent_tree(inode);
+
+ ret = __exchange_data_block(inode, inode, idx,
+ idx + delta, nr, false);
f2fs_unlock_op(sbi);
- if (ret)
- break;
}
/* write out all moved pages, if possible */
@@ -1135,7 +1304,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache(inode, offset);
if (!ret)
- i_size_write(inode, new_size);
+ f2fs_i_size_write(inode, new_size);
return ret;
}
@@ -1143,10 +1312,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- pgoff_t index, pg_start, pg_end;
+ struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
+ pgoff_t pg_end;
loff_t new_size = i_size_read(inode);
- loff_t off_start, off_end;
- int ret = 0;
+ loff_t off_end;
+ int ret;
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
@@ -1158,43 +1328,32 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
f2fs_balance_fs(sbi, true);
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+ pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT;
+ off_end = (offset + len) & (PAGE_SIZE - 1);
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT;
+ map.m_len = pg_end - map.m_lblk;
+ if (off_end)
+ map.m_len++;
- f2fs_lock_op(sbi);
+ ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ if (ret) {
+ pgoff_t last_off;
- for (index = pg_start; index <= pg_end; index++) {
- struct dnode_of_data dn;
+ if (!map.m_len)
+ return ret;
- if (index == pg_end && !off_end)
- goto noalloc;
+ last_off = map.m_lblk + map.m_len - 1;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = f2fs_reserve_block(&dn, index);
- if (ret)
- break;
-noalloc:
- if (pg_start == pg_end)
- new_size = offset + len;
- else if (index == pg_start && off_start)
- new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
- else if (index == pg_end)
- new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
- off_end;
- else
- new_size += PAGE_CACHE_SIZE;
+ /* update new size to the failed position */
+ new_size = (last_off == pg_end) ? offset + len:
+ (loff_t)(last_off + 1) << PAGE_SHIFT;
+ } else {
+ new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- i_size_read(inode) < new_size) {
- i_size_write(inode, new_size);
- mark_inode_dirty(inode);
- update_inode_page(inode);
- }
- f2fs_unlock_op(sbi);
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+ f2fs_i_size_write(inode, new_size);
return ret;
}
@@ -1237,7 +1396,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -1250,13 +1409,22 @@ out:
static int f2fs_release_file(struct inode *inode, struct file *filp)
{
+ /*
+ * f2fs_relase_file is called at every close calls. So we should
+ * not drop any inmemory pages by close called by other process.
+ */
+ if (!(filp->f_mode & FMODE_WRITE) ||
+ atomic_read(&inode->i_writecount) != 1)
+ return 0;
+
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
- set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+ clear_inode_flag(inode, FI_VOLATILE_FILE);
+ set_inode_flag(inode, FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
- clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+ clear_inode_flag(inode, FI_DROP_CACHE);
}
return 0;
}
@@ -1290,20 +1458,16 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
unsigned int oldflags;
int ret;
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (get_user(flags, (int __user *)arg))
+ return -EFAULT;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out;
- }
-
- if (get_user(flags, (int __user *)arg)) {
- ret = -EFAULT;
- goto out;
- }
-
flags = f2fs_mask_flags(inode->i_mode, flags);
inode_lock(inode);
@@ -1323,9 +1487,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
fi->i_flags = flags;
inode_unlock(inode);
- f2fs_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+ f2fs_set_inode_flags(inode);
out:
mnt_drop_write_file(filp);
return ret;
@@ -1346,17 +1509,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
if (f2fs_is_atomic_file(inode))
- return 0;
+ goto out;
ret = f2fs_convert_inline_inode(inode);
if (ret)
- return ret;
+ goto out;
- set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ set_inode_flag(inode, FI_ATOMIC_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return 0;
+ if (!get_dirty_pages(inode))
+ goto out;
+
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+ "Unexpected flush for atomic writes: ino=%lu, npages=%lld",
+ inode->i_ino, get_dirty_pages(inode));
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (ret)
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -1367,24 +1548,27 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
- if (f2fs_is_volatile_file(inode))
- return 0;
-
ret = mnt_want_write_file(filp);
if (ret)
return ret;
+ inode_lock(inode);
+
+ if (f2fs_is_volatile_file(inode))
+ goto err_out;
+
if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
ret = commit_inmem_pages(inode);
if (ret) {
- set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ set_inode_flag(inode, FI_ATOMIC_FILE);
goto err_out;
}
}
- ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
err_out:
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
}
@@ -1397,32 +1581,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
if (f2fs_is_volatile_file(inode))
- return 0;
+ goto out;
ret = f2fs_convert_inline_inode(inode);
if (ret)
- return ret;
+ goto out;
- set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ set_inode_flag(inode, FI_VOLATILE_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return 0;
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_release_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ int ret;
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
if (!f2fs_is_volatile_file(inode))
- return 0;
+ goto out;
- if (!f2fs_is_first_block_written(inode))
- return truncate_partial_data_page(inode, 0, true);
+ if (!f2fs_is_first_block_written(inode)) {
+ ret = truncate_partial_data_page(inode, 0, true);
+ goto out;
+ }
- return punch_hole(inode, 0, F2FS_BLKSIZE);
+ ret = punch_hole(inode, 0, F2FS_BLKSIZE);
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_abort_volatile_write(struct file *filp)
@@ -1437,15 +1643,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
if (ret)
return ret;
- if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ inode_lock(inode);
+
+ if (f2fs_is_atomic_file(inode))
drop_inmem_pages(inode);
- }
if (f2fs_is_volatile_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+ clear_inode_flag(inode, FI_VOLATILE_FILE);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
}
+ inode_unlock(inode);
+
mnt_drop_write_file(filp);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
@@ -1457,6 +1665,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct super_block *sb = sbi->sb;
__u32 in;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1464,31 +1673,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
if (get_user(in, (__u32 __user *)arg))
return -EFAULT;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
switch (in) {
case F2FS_GOING_DOWN_FULLSYNC:
sb = freeze_bdev(sb->s_bdev);
if (sb && !IS_ERR(sb)) {
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
thaw_bdev(sb->s_bdev, sb);
}
break;
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
f2fs_sync_fs(sb, 1);
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
break;
case F2FS_GOING_DOWN_NOSYNC:
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
break;
case F2FS_GOING_DOWN_METAFLUSH:
sync_meta_pages(sbi, META, LONG_MAX);
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
break;
default:
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
f2fs_update_time(sbi, REQ_TIME);
- return 0;
+out:
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
@@ -1509,9 +1725,14 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
sizeof(range)))
return -EFAULT;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
range.minlen = max((unsigned int)range.minlen,
q->limits.discard_granularity);
ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+ mnt_drop_write_file(filp);
if (ret < 0)
return ret;
@@ -1536,13 +1757,21 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
+ int ret;
if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
sizeof(policy)))
return -EFAULT;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return fscrypt_process_policy(inode, &policy);
+ ret = fscrypt_process_policy(inode, &policy);
+
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
@@ -1599,6 +1828,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
__u32 sync;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1609,20 +1839,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
if (f2fs_readonly(sbi->sb))
return -EROFS;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
if (!sync) {
- if (!mutex_trylock(&sbi->gc_mutex))
- return -EBUSY;
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ ret = -EBUSY;
+ goto out;
+ }
} else {
mutex_lock(&sbi->gc_mutex);
}
- return f2fs_gc(sbi, sync);
+ ret = f2fs_gc(sbi, sync);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1630,7 +1870,14 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
if (f2fs_readonly(sbi->sb))
return -EROFS;
- return f2fs_sync_fs(sbi->sb, 1);
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = f2fs_sync_fs(sbi->sb, 1);
+
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
@@ -1652,8 +1899,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
if (need_inplace_update(inode))
return -EINVAL;
- pg_start = range->start >> PAGE_CACHE_SHIFT;
- pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
+ pg_start = range->start >> PAGE_SHIFT;
+ pg_end = (range->start + range->len) >> PAGE_SHIFT;
f2fs_balance_fs(sbi, true);
@@ -1734,7 +1981,7 @@ do_map:
continue;
}
- set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_DO_DEFRAG);
idx = map.m_lblk;
while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
@@ -1759,18 +2006,18 @@ do_map:
if (idx < pg_end && cnt < blk_per_seg)
goto do_map;
- clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_DO_DEFRAG);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
goto out;
}
clear_out:
- clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_DO_DEFRAG);
out:
inode_unlock(inode);
if (!err)
- range->len = (u64)total << PAGE_CACHE_SHIFT;
+ range->len = (u64)total << PAGE_SHIFT;
return err;
}
@@ -1822,6 +2069,133 @@ out:
return err;
}
+static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, size_t len)
+{
+ struct inode *src = file_inode(file_in);
+ struct inode *dst = file_inode(file_out);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(src);
+ size_t olen = len, dst_max_i_size = 0;
+ size_t dst_osize;
+ int ret;
+
+ if (file_in->f_path.mnt != file_out->f_path.mnt ||
+ src->i_sb != dst->i_sb)
+ return -EXDEV;
+
+ if (unlikely(f2fs_readonly(src->i_sb)))
+ return -EROFS;
+
+ if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
+ return -EISDIR;
+
+ if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
+ return -EOPNOTSUPP;
+
+ inode_lock(src);
+ if (src != dst)
+ inode_lock(dst);
+
+ ret = -EINVAL;
+ if (pos_in + len > src->i_size || pos_in + len < pos_in)
+ goto out_unlock;
+ if (len == 0)
+ olen = len = src->i_size - pos_in;
+ if (pos_in + len == src->i_size)
+ len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in;
+ if (len == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ dst_osize = dst->i_size;
+ if (pos_out + olen > dst->i_size)
+ dst_max_i_size = pos_out + olen;
+
+ /* verify the end result is block aligned */
+ if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) ||
+ !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) ||
+ !IS_ALIGNED(pos_out, F2FS_BLKSIZE))
+ goto out_unlock;
+
+ ret = f2fs_convert_inline_inode(src);
+ if (ret)
+ goto out_unlock;
+
+ ret = f2fs_convert_inline_inode(dst);
+ if (ret)
+ goto out_unlock;
+
+ /* write out all dirty pages from offset */
+ ret = filemap_write_and_wait_range(src->i_mapping,
+ pos_in, pos_in + len);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(dst->i_mapping,
+ pos_out, pos_out + len);
+ if (ret)
+ goto out_unlock;
+
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+ ret = __exchange_data_block(src, dst, pos_in,
+ pos_out, len >> F2FS_BLKSIZE_BITS, false);
+
+ if (!ret) {
+ if (dst_max_i_size)
+ f2fs_i_size_write(dst, dst_max_i_size);
+ else if (dst_osize != dst->i_size)
+ f2fs_i_size_write(dst, dst_osize);
+ }
+ f2fs_unlock_op(sbi);
+out_unlock:
+ if (src != dst)
+ inode_unlock(dst);
+ inode_unlock(src);
+ return ret;
+}
+
+static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
+{
+ struct f2fs_move_range range;
+ struct fd dst;
+ int err;
+
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&range, (struct f2fs_move_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ dst = fdget(range.dst_fd);
+ if (!dst.file)
+ return -EBADF;
+
+ if (!(dst.file->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto err_out;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto err_out;
+
+ err = f2fs_move_file_range(filp, range.pos_in, dst.file,
+ range.pos_out, range.len);
+
+ mnt_drop_write_file(filp);
+
+ if (copy_to_user((struct f2fs_move_range __user *)arg,
+ &range, sizeof(range)))
+ err = -EFAULT;
+err_out:
+ fdput(dst);
+ return err;
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -1857,6 +2231,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_write_checkpoint(filp, arg);
case F2FS_IOC_DEFRAGMENT:
return f2fs_ioc_defragment(filp, arg);
+ case F2FS_IOC_MOVE_RANGE:
+ return f2fs_ioc_move_range(filp, arg);
default:
return -ENOTTY;
}
@@ -1866,6 +2242,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct blk_plug plug;
ssize_t ret;
if (f2fs_encrypted_inode(inode) &&
@@ -1877,18 +2254,16 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = generic_write_checks(iocb, from);
if (ret > 0) {
ret = f2fs_preallocate_blocks(iocb, from);
- if (!ret)
+ if (!ret) {
+ blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
+ blk_finish_plug(&plug);
+ }
}
inode_unlock(inode);
- if (ret > 0) {
- ssize_t err;
-
- err = generic_write_sync(file, iocb->ki_pos - ret, ret);
- if (err < 0)
- ret = err;
- }
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
return ret;
}
@@ -1918,6 +2293,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_WRITE_CHECKPOINT:
case F2FS_IOC_DEFRAGMENT:
break;
+ case F2FS_IOC_MOVE_RANGE:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b0051a97824cc..8f7fa326ce95b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -96,7 +96,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
int err = 0;
- gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+ gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th) {
err = -ENOMEM;
goto out;
@@ -465,15 +465,7 @@ next_step:
continue;
}
- /* set page dirty and write it */
- if (gc_type == FG_GC) {
- f2fs_wait_on_page_writeback(node_page, NODE, true);
- set_page_dirty(node_page);
- } else {
- if (!PageWriteback(node_page))
- set_page_dirty(node_page);
- }
- f2fs_put_page(node_page, 1);
+ move_node_page(node_page, gc_type);
stat_inc_node_blk_count(sbi, 1, gc_type);
}
@@ -546,7 +538,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.type = DATA,
- .rw = READ_SYNC,
+ .op = REQ_OP_READ,
+ .op_flags = READ_SYNC,
.encrypted_page = NULL,
};
struct dnode_of_data dn;
@@ -601,11 +594,11 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
/* write page */
lock_page(fio.encrypted_page);
- if (unlikely(!PageUptodate(fio.encrypted_page))) {
+ if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
err = -EIO;
goto put_page_out;
}
- if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
+ if (unlikely(!PageUptodate(fio.encrypted_page))) {
err = -EIO;
goto put_page_out;
}
@@ -620,14 +613,15 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
/* allocate block address */
f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
- fio.rw = WRITE_SYNC;
+ fio.op = REQ_OP_WRITE;
+ fio.op_flags = WRITE_SYNC;
fio.new_blkaddr = newaddr;
f2fs_submit_page_mbio(&fio);
f2fs_update_data_blkaddr(&dn, newaddr);
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_APPEND_WRITE);
if (page->index == 0)
- set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
@@ -657,16 +651,28 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.type = DATA,
- .rw = WRITE_SYNC,
+ .op = REQ_OP_WRITE,
+ .op_flags = WRITE_SYNC,
.page = page,
.encrypted_page = NULL,
};
+ bool is_dirty = PageDirty(page);
+ int err;
+
+retry:
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page))
inode_dec_dirty_pages(inode);
+
set_cold_data(page);
- do_write_data_page(&fio);
+
+ err = do_write_data_page(&fio);
+ if (err == -ENOMEM && is_dirty) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+
clear_cold_data(page);
}
out:
@@ -738,7 +744,8 @@ next_step:
start_bidx = start_bidx_of_node(nofs, inode);
data_page = get_read_data_page(inode,
- start_bidx + ofs_in_node, READA, true);
+ start_bidx + ofs_in_node, REQ_RAHEAD,
+ true);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -752,12 +759,32 @@ next_step:
/* phase 3 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ bool locked = false;
+
+ if (S_ISREG(inode->i_mode)) {
+ if (!down_write_trylock(&fi->dio_rwsem[READ]))
+ continue;
+ if (!down_write_trylock(
+ &fi->dio_rwsem[WRITE])) {
+ up_write(&fi->dio_rwsem[READ]);
+ continue;
+ }
+ locked = true;
+ }
+
start_bidx = start_bidx_of_node(nofs, inode)
+ ofs_in_node;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
move_encrypted_block(inode, start_bidx);
else
move_data_page(inode, start_bidx, gc_type);
+
+ if (locked) {
+ up_write(&fi->dio_rwsem[WRITE]);
+ up_write(&fi->dio_rwsem[READ]);
+ }
+
stat_inc_data_blk_count(sbi, 1, gc_type);
}
}
@@ -806,6 +833,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
blk_start_plug(&plug);
for (segno = start_segno; segno < end_segno; segno++) {
+
+ if (get_valid_blocks(sbi, segno, 1) == 0)
+ continue;
+
/* find segment summary of victim */
sum_page = find_get_page(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
@@ -834,18 +865,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
f2fs_put_page(sum_page, 0);
}
- if (gc_type == FG_GC) {
- if (type == SUM_TYPE_NODE) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .for_reclaim = 0,
- };
- sync_node_pages(sbi, 0, &wbc);
- } else {
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- }
- }
+ if (gc_type == FG_GC)
+ f2fs_submit_merged_bio(sbi,
+ (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE);
blk_finish_plug(&plug);
@@ -890,10 +912,13 @@ gc_more:
* enough free sections, we should flush dent/node blocks and do
* garbage collections.
*/
- if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
+ if (__get_victim(sbi, &segno, gc_type) ||
+ prefree_segments(sbi)) {
write_checkpoint(sbi, &cpc);
- else if (has_not_enough_free_secs(sbi, 0))
+ segno = NULL_SEGNO;
+ } else if (has_not_enough_free_secs(sbi, 0)) {
write_checkpoint(sbi, &cpc);
+ }
}
if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 358214e9f7076..ccea8735de593 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -51,7 +51,7 @@ void read_inline_data(struct page *page, struct page *ipage)
f2fs_bug_on(F2FS_P_SB(page), page->index);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
/* Copy the whole inline data block */
src_addr = inline_data_addr(ipage);
@@ -59,7 +59,8 @@ void read_inline_data(struct page *page, struct page *ipage)
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
flush_dcache_page(page);
kunmap_atomic(dst_addr);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
}
bool truncate_inline_inode(struct page *ipage, u64 from)
@@ -73,7 +74,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from)
f2fs_wait_on_page_writeback(ipage, NODE, true);
memset(addr + from, 0, MAX_INLINE_DATA - from);
-
+ set_page_dirty(ipage);
return true;
}
@@ -93,11 +94,12 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
}
if (page->index)
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
else
read_inline_data(page, ipage);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
f2fs_put_page(ipage, 1);
unlock_page(page);
return 0;
@@ -108,7 +110,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(dn->inode),
.type = DATA,
- .rw = WRITE_SYNC | REQ_PRIO,
+ .op = REQ_OP_WRITE,
+ .op_flags = WRITE_SYNC | REQ_PRIO,
.page = page,
.encrypted_page = NULL,
};
@@ -138,7 +141,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
inode_dec_dirty_pages(dn->inode);
/* this converted inline_data should be recovered. */
- set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
+ set_inode_flag(dn->inode, FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
truncate_inline_inode(dn->inode_page, 0);
@@ -146,7 +149,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
clear_out:
stat_dec_inline_inode(dn->inode);
f2fs_clear_inline_inode(dn->inode);
- sync_inode_page(dn);
f2fs_put_dnode(dn);
return 0;
}
@@ -161,7 +163,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
if (!f2fs_has_inline_data(inode))
return 0;
- page = grab_cache_page(inode->i_mapping, 0);
+ page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
if (!page)
return -ENOMEM;
@@ -212,11 +214,11 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
dst_addr = inline_data_addr(dn.inode_page);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
kunmap_atomic(src_addr);
+ set_page_dirty(dn.inode_page);
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_DATA_EXIST);
- sync_inode_page(&dn);
clear_inline_node(dn.inode_page);
f2fs_put_dnode(&dn);
return 0;
@@ -252,10 +254,10 @@ process_inline:
dst_addr = inline_data_addr(ipage);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ set_inode_flag(inode, FI_INLINE_DATA);
+ set_inode_flag(inode, FI_DATA_EXIST);
- update_inode(inode, ipage);
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
return true;
}
@@ -266,7 +268,6 @@ process_inline:
if (!truncate_inline_inode(ipage, 0))
return false;
f2fs_clear_inline_inode(inode);
- update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
if (truncate_blocks(inode, 0, false))
@@ -288,8 +289,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
f2fs_hash_t namehash;
ipage = get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage))
+ if (IS_ERR(ipage)) {
+ *res_page = ipage;
return NULL;
+ }
namehash = f2fs_dentry_hash(&name);
@@ -303,30 +306,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
else
f2fs_put_page(ipage, 0);
- /*
- * For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs has occurred.
- */
- f2fs_bug_on(sbi, d.max < 0);
- return de;
-}
-
-struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir,
- struct page **p)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct page *ipage;
- struct f2fs_dir_entry *de;
- struct f2fs_inline_dentry *dentry_blk;
-
- ipage = get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage))
- return NULL;
-
- dentry_blk = inline_data_addr(ipage);
- de = &dentry_blk->dentry[1];
- *p = ipage;
- unlock_page(ipage);
return de;
}
@@ -344,10 +323,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
set_page_dirty(ipage);
/* update i_size to MAX_INLINE_DATA */
- if (i_size_read(inode) < MAX_INLINE_DATA) {
- i_size_write(inode, MAX_INLINE_DATA);
- set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
- }
+ if (i_size_read(inode) < MAX_INLINE_DATA)
+ f2fs_i_size_write(inode, MAX_INLINE_DATA);
return 0;
}
@@ -355,7 +332,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
* NOTE: ipage is grabbed by caller, but if any error occurs, we should
* release ipage in this function.
*/
-static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
struct f2fs_inline_dentry *inline_dentry)
{
struct page *page;
@@ -363,7 +340,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
struct f2fs_dentry_block *dentry_blk;
int err;
- page = grab_cache_page(dir->i_mapping, 0);
+ page = f2fs_grab_cache_page(dir->i_mapping, 0, false);
if (!page) {
f2fs_put_page(ipage, 1);
return -ENOMEM;
@@ -375,7 +352,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
goto out;
f2fs_wait_on_page_writeback(page, DATA, true);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
dentry_blk = kmap_atomic(page);
@@ -396,26 +373,121 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
NR_INLINE_DENTRY * F2FS_SLOT_LEN);
kunmap_atomic(dentry_blk);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
set_page_dirty(page);
/* clear inline dir and flag after data writeback */
truncate_inline_inode(ipage, 0);
stat_dec_inline_dir(dir);
- clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
-
- if (i_size_read(dir) < PAGE_CACHE_SIZE) {
- i_size_write(dir, PAGE_CACHE_SIZE);
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
+ clear_inode_flag(dir, FI_INLINE_DENTRY);
- sync_inode_page(&dn);
+ f2fs_i_depth_write(dir, 1);
+ if (i_size_read(dir) < PAGE_SIZE)
+ f2fs_i_size_write(dir, PAGE_SIZE);
out:
f2fs_put_page(page, 1);
return err;
}
+static int f2fs_add_inline_entries(struct inode *dir,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct f2fs_dentry_ptr d;
+ unsigned long bit_pos = 0;
+ int err = 0;
+
+ make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+
+ while (bit_pos < d.max) {
+ struct f2fs_dir_entry *de;
+ struct qstr new_name;
+ nid_t ino;
+ umode_t fake_mode;
+
+ if (!test_bit_le(bit_pos, d.bitmap)) {
+ bit_pos++;
+ continue;
+ }
+
+ de = &d.dentry[bit_pos];
+
+ if (unlikely(!de->name_len)) {
+ bit_pos++;
+ continue;
+ }
+
+ new_name.name = d.filename[bit_pos];
+ new_name.len = de->name_len;
+
+ ino = le32_to_cpu(de->ino);
+ fake_mode = get_de_type(de) << S_SHIFT;
+
+ err = f2fs_add_regular_entry(dir, &new_name, NULL,
+ ino, fake_mode);
+ if (err)
+ goto punch_dentry_pages;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ }
+ return 0;
+punch_dentry_pages:
+ truncate_inode_pages(&dir->i_data, 0);
+ truncate_blocks(dir, 0, false);
+ remove_dirty_inode(dir);
+ return err;
+}
+
+static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct f2fs_inline_dentry *backup_dentry;
+ int err;
+
+ backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry),
+ GFP_F2FS_ZERO);
+ if (!backup_dentry) {
+ f2fs_put_page(ipage, 1);
+ return -ENOMEM;
+ }
+
+ memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
+ truncate_inline_inode(ipage, 0);
+
+ unlock_page(ipage);
+
+ err = f2fs_add_inline_entries(dir, backup_dentry);
+ if (err)
+ goto recover;
+
+ lock_page(ipage);
+
+ stat_dec_inline_dir(dir);
+ clear_inode_flag(dir, FI_INLINE_DENTRY);
+ kfree(backup_dentry);
+ return 0;
+recover:
+ lock_page(ipage);
+ memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA);
+ f2fs_i_depth_write(dir, 0);
+ f2fs_i_size_write(dir, MAX_INLINE_DATA);
+ set_page_dirty(ipage);
+ f2fs_put_page(ipage, 1);
+
+ kfree(backup_dentry);
+ return err;
+}
+
+static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ if (!F2FS_I(dir)->i_dir_level)
+ return f2fs_move_inline_dirents(dir, ipage, inline_dentry);
+ else
+ return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
+}
+
int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode)
{
@@ -464,8 +536,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
/* we don't need to mark_inode_dirty now */
if (inode) {
- F2FS_I(inode)->i_pino = dir->i_ino;
- update_inode(inode, page);
+ f2fs_i_pino_write(inode, dir->i_ino);
f2fs_put_page(page, 1);
}
@@ -473,11 +544,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
-
- if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
- update_inode(dir, ipage);
- clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
out:
f2fs_put_page(ipage, 1);
return err;
@@ -501,13 +567,13 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
&inline_dentry->dentry_bitmap);
set_page_dirty(page);
+ f2fs_put_page(page, 1);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ f2fs_mark_inode_dirty_sync(dir);
if (inode)
- f2fs_drop_nlink(dir, inode, page);
-
- f2fs_put_page(page, 1);
+ f2fs_drop_nlink(dir, inode);
}
bool f2fs_empty_inline_dir(struct inode *dir)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cb269c46ac254..9ac5efc15347f 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -18,6 +18,13 @@
#include <trace/events/f2fs.h>
+void f2fs_mark_inode_dirty_sync(struct inode *inode)
+{
+ if (f2fs_inode_dirtied(inode))
+ return;
+ mark_inode_dirty_sync(inode);
+}
+
void f2fs_set_inode_flags(struct inode *inode)
{
unsigned int flags = F2FS_I(inode)->i_flags;
@@ -35,6 +42,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_DIRSYNC;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ f2fs_mark_inode_dirty_sync(inode);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -85,8 +93,8 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
if (*start++) {
f2fs_wait_on_page_writeback(ipage, NODE, true);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
+ set_inode_flag(inode, FI_DATA_EXIST);
+ set_raw_inline(inode, F2FS_INODE(ipage));
set_page_dirty(ipage);
return;
}
@@ -141,7 +149,7 @@ static int do_read_inode(struct inode *inode)
if (f2fs_init_extent_tree(inode, &ri->i_ext))
set_page_dirty(node_page);
- get_inline_info(fi, ri);
+ get_inline_info(inode, ri);
/* check data exist */
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
@@ -151,7 +159,10 @@ static int do_read_inode(struct inode *inode)
__get_inode_rdev(inode, ri);
if (__written_first_block(ri))
- set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+
+ if (!need_inode_block_update(sbi, inode->i_ino))
+ fi->last_disk_size = inode->i_size;
f2fs_put_page(node_page, 1);
@@ -227,6 +238,8 @@ int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
+ f2fs_inode_synced(inode);
+
f2fs_wait_on_page_writeback(node_page, NODE, true);
ri = F2FS_INODE(node_page);
@@ -244,7 +257,7 @@ int update_inode(struct inode *inode, struct page *node_page)
&ri->i_ext);
else
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
- set_raw_inline(F2FS_I(inode), ri);
+ set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -261,7 +274,6 @@ int update_inode(struct inode *inode, struct page *node_page)
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
- clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
/* deleted inode */
if (inode->i_nlink == 0)
@@ -283,8 +295,9 @@ retry:
cond_resched();
goto retry;
} else if (err != -ENOENT) {
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
}
+ f2fs_inode_synced(inode);
return 0;
}
ret = update_inode(inode, node_page);
@@ -300,7 +313,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
- if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+ if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
/*
@@ -318,8 +331,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
- nid_t xnid = fi->i_xattr_nid;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int err = 0;
/* some remained atomic pages should discarded */
@@ -341,12 +353,17 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_EVICT_INODE))
+ goto no_delete;
+#endif
+
sb_start_intwrite(inode->i_sb);
- set_inode_flag(fi, FI_NO_ALLOC);
+ set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
-
+retry:
if (F2FS_HAS_BLOCKS(inode))
- err = f2fs_truncate(inode, true);
+ err = f2fs_truncate(inode);
if (!err) {
f2fs_lock_op(sbi);
@@ -354,6 +371,14 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_unlock_op(sbi);
}
+ /* give more chances, if ENOMEM case */
+ if (err == -ENOMEM) {
+ err = 0;
+ goto retry;
+ }
+
+ if (err)
+ update_inode_page(inode);
sb_end_intwrite(inode->i_sb);
no_delete:
stat_dec_inline_xattr(inode);
@@ -363,31 +388,16 @@ no_delete:
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(fi, FI_APPEND_WRITE))
+ if (is_inode_flag_set(inode, FI_APPEND_WRITE))
add_ino_entry(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
- if (is_inode_flag_set(fi, FI_FREE_NID)) {
- if (err && err != -ENOENT)
- alloc_nid_done(sbi, inode->i_ino);
- else
- alloc_nid_failed(sbi, inode->i_ino);
- clear_inode_flag(fi, FI_FREE_NID);
- }
-
- if (err && err != -ENOENT) {
- if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) {
- /*
- * get here because we failed to release resource
- * of inode previously, reminder our user to run fsck
- * for fixing.
- */
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_msg(sbi->sb, KERN_WARNING,
- "inode (ino:%lu) resource leak, run fsck "
- "to fix this issue!", inode->i_ino);
- }
+ if (is_inode_flag_set(inode, FI_FREE_NID)) {
+ alloc_nid_failed(sbi, inode->i_ino);
+ clear_inode_flag(inode, FI_FREE_NID);
}
+ f2fs_bug_on(sbi, err &&
+ !exist_written_data(sbi, inode->i_ino, ORPHAN_INO));
out_clear:
fscrypt_put_encryption_info(inode, NULL);
clear_inode(inode);
@@ -397,37 +407,32 @@ out_clear:
void handle_failed_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int err = 0;
+ struct node_info ni;
- clear_nlink(inode);
- make_bad_inode(inode);
+ /* don't make bad inode, since it becomes a regular file. */
unlock_new_inode(inode);
- i_size_write(inode, 0);
- if (F2FS_HAS_BLOCKS(inode))
- err = f2fs_truncate(inode, false);
-
- if (!err)
- err = remove_inode_page(inode);
-
/*
- * if we skip truncate_node in remove_inode_page bacause we failed
- * before, it's better to find another way to release resource of
- * this inode (e.g. valid block count, node block or nid). Here we
- * choose to add this inode to orphan list, so that we can call iput
- * for releasing in orphan recovery flow.
- *
* Note: we should add inode to orphan list before f2fs_unlock_op()
* so we can prevent losing this orphan when encoutering checkpoint
* and following suddenly power-off.
*/
- if (err && err != -ENOENT) {
- err = acquire_orphan_inode(sbi);
- if (!err)
- add_orphan_inode(sbi, inode->i_ino);
+ get_node_info(sbi, inode->i_ino, &ni);
+
+ if (ni.blk_addr != NULL_ADDR) {
+ int err = acquire_orphan_inode(sbi);
+ if (err) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Too many orphan inodes, run fsck to fix.");
+ } else {
+ add_orphan_inode(inode);
+ }
+ alloc_nid_done(sbi, inode->i_ino);
+ } else {
+ set_inode_flag(inode, FI_FREE_NID);
}
- set_inode_flag(F2FS_I(inode), FI_FREE_NID);
f2fs_unlock_op(sbi);
/* iput will drop the inode object */
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 7876f10521019..73fa356f8fbb0 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
+ set_inode_flag(inode, FI_NEW_INODE);
+
+ if (test_opt(sbi, INLINE_XATTR))
+ set_inode_flag(inode, FI_INLINE_XATTR);
if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
- set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
+ set_inode_flag(inode, FI_INLINE_DENTRY);
f2fs_init_extent_tree(inode, NULL);
@@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
stat_inc_inline_dir(inode);
trace_f2fs_new_inode(inode, 0);
- mark_inode_dirty(inode);
return inode;
fail:
trace_f2fs_new_inode(inode, err);
make_bad_inode(inode);
if (nid_free)
- set_inode_flag(F2FS_I(inode), FI_FREE_NID);
+ set_inode_flag(inode, FI_FREE_NID);
iput(inode);
return ERR_PTR(err);
}
@@ -177,7 +180,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
inode->i_ctime = CURRENT_TIME;
ihold(inode);
- set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -190,7 +193,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_sync_fs(sbi->sb, 1);
return 0;
out:
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_inode_flag(inode, FI_INC_LINK);
iput(inode);
f2fs_unlock_op(sbi);
return err;
@@ -199,10 +202,14 @@ out:
struct dentry *f2fs_get_parent(struct dentry *child)
{
struct qstr dotdot = QSTR_INIT("..", 2);
- unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot);
- if (!ino)
+ struct page *page;
+ unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
+ if (!ino) {
+ if (IS_ERR(page))
+ return ERR_CAST(page);
return ERR_PTR(-ENOENT);
- return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino));
+ }
+ return d_obtain_alias(f2fs_iget(child->d_sb, ino));
}
static int __recover_dot_dentries(struct inode *dir, nid_t pino)
@@ -229,6 +236,9 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
if (de) {
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto out;
} else {
err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
if (err)
@@ -239,14 +249,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
if (de) {
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
} else {
err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
}
out:
- if (!err) {
- clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS);
- mark_inode_dirty(dir);
- }
+ if (!err)
+ clear_inode_flag(dir, FI_INLINE_DOTS);
f2fs_unlock_op(sbi);
return err;
@@ -281,8 +291,11 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENAMETOOLONG);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
- if (!de)
+ if (!de) {
+ if (IS_ERR(page))
+ return (struct dentry *)page;
return d_splice_alias(inode, dentry);
+ }
ino = le32_to_cpu(de->ino);
f2fs_dentry_kunmap(dir, page);
@@ -329,8 +342,11 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
trace_f2fs_unlink_enter(dir, dentry);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
- if (!de)
+ if (!de) {
+ if (IS_ERR(page))
+ err = PTR_ERR(page);
goto fail;
+ }
f2fs_balance_fs(sbi, true);
@@ -345,9 +361,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_delete_entry(de, page, dir, inode);
f2fs_unlock_op(sbi);
- /* In order to evict this inode, we set it dirty */
- mark_inode_dirty(inode);
-
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
fail:
@@ -492,7 +505,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
f2fs_balance_fs(sbi, true);
- set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -509,7 +522,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
out_fail:
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_inode_flag(inode, FI_INC_LINK);
handle_failed_inode(inode);
return err;
}
@@ -592,17 +605,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
* add this non-linked tmpfile to orphan list, in this way we could
* remove all unused data of tmpfile after abnormal power-off.
*/
- add_orphan_inode(sbi, inode->i_ino);
- f2fs_unlock_op(sbi);
-
+ add_orphan_inode(inode);
alloc_nid_done(sbi, inode->i_ino);
if (whiteout) {
- inode_dec_link_count(inode);
+ f2fs_i_links_write(inode, false);
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
}
+ /* link_count was changed by d_tmpfile as well. */
+ f2fs_unlock_op(sbi);
unlock_new_inode(inode);
return 0;
@@ -652,14 +665,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
- if (!old_entry)
+ if (!old_entry) {
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
goto out;
+ }
if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
- if (!old_dir_entry)
+ if (!old_dir_entry) {
+ if (IS_ERR(old_dir_page))
+ err = PTR_ERR(old_dir_page);
goto out_old;
+ }
}
if (flags & RENAME_WHITEOUT) {
@@ -677,8 +695,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = -ENOENT;
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
&new_page);
- if (!new_entry)
+ if (!new_entry) {
+ if (IS_ERR(new_page))
+ err = PTR_ERR(new_page);
goto out_whiteout;
+ }
f2fs_balance_fs(sbi, true);
@@ -700,19 +721,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode->i_ctime = CURRENT_TIME;
down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
- drop_nlink(new_inode);
- drop_nlink(new_inode);
+ f2fs_i_links_write(new_inode, false);
+ f2fs_i_links_write(new_inode, false);
up_write(&F2FS_I(new_inode)->i_sem);
- mark_inode_dirty(new_inode);
-
if (!new_inode->i_nlink)
- add_orphan_inode(sbi, new_inode->i_ino);
+ add_orphan_inode(new_inode);
else
release_orphan_inode(sbi);
-
- update_inode_page(old_inode);
- update_inode_page(new_inode);
} else {
f2fs_balance_fs(sbi, true);
@@ -724,10 +740,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_whiteout;
}
- if (old_dir_entry) {
- inc_nlink(new_dir);
- update_inode_page(new_dir);
- }
+ if (old_dir_entry)
+ f2fs_i_links_write(new_dir, true);
/*
* old entry and new entry can locate in the same inline
@@ -743,7 +757,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_entry = f2fs_find_entry(old_dir,
&old_dentry->d_name, &old_page);
if (!old_entry) {
- err = -EIO;
+ err = -ENOENT;
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
f2fs_unlock_op(sbi);
goto out_whiteout;
}
@@ -757,13 +773,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(old_inode);
+ f2fs_mark_inode_dirty_sync(old_inode);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
if (whiteout) {
whiteout->i_state |= I_LINKABLE;
- set_inode_flag(F2FS_I(whiteout), FI_INC_LINK);
+ set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
if (err)
goto put_out_dir;
@@ -775,14 +791,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir != new_dir && !whiteout) {
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
- update_inode_page(old_inode);
} else {
f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
- drop_nlink(old_dir);
- mark_inode_dirty(old_dir);
- update_inode_page(old_dir);
+ f2fs_i_links_write(old_dir, false);
}
f2fs_unlock_op(sbi);
@@ -832,29 +845,39 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
return -EPERM;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
- if (!old_entry)
+ if (!old_entry) {
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
goto out;
+ }
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page);
- if (!new_entry)
+ if (!new_entry) {
+ if (IS_ERR(new_page))
+ err = PTR_ERR(new_page);
goto out_old;
+ }
/* prepare for updating ".." directory entry info later */
if (old_dir != new_dir) {
if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
old_dir_entry = f2fs_parent_dir(old_inode,
&old_dir_page);
- if (!old_dir_entry)
+ if (!old_dir_entry) {
+ if (IS_ERR(old_dir_page))
+ err = PTR_ERR(old_dir_page);
goto out_new;
+ }
}
if (S_ISDIR(new_inode->i_mode)) {
- err = -EIO;
new_dir_entry = f2fs_parent_dir(new_inode,
&new_dir_page);
- if (!new_dir_entry)
+ if (!new_dir_entry) {
+ if (IS_ERR(new_dir_page))
+ err = PTR_ERR(new_dir_page);
goto out_old_dir;
+ }
}
}
@@ -904,19 +927,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
- update_inode_page(old_inode);
-
old_dir->i_ctime = CURRENT_TIME;
if (old_nlink) {
down_write(&F2FS_I(old_dir)->i_sem);
- if (old_nlink < 0)
- drop_nlink(old_dir);
- else
- inc_nlink(old_dir);
+ f2fs_i_links_write(old_dir, old_nlink > 0);
up_write(&F2FS_I(old_dir)->i_sem);
}
- mark_inode_dirty(old_dir);
- update_inode_page(old_dir);
+ f2fs_mark_inode_dirty_sync(old_dir);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
@@ -925,19 +942,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(new_inode);
up_write(&F2FS_I(new_inode)->i_sem);
- update_inode_page(new_inode);
-
new_dir->i_ctime = CURRENT_TIME;
if (new_nlink) {
down_write(&F2FS_I(new_dir)->i_sem);
- if (new_nlink < 0)
- drop_nlink(new_dir);
- else
- inc_nlink(new_dir);
+ f2fs_i_links_write(new_dir, new_nlink > 0);
up_write(&F2FS_I(new_dir)->i_sem);
}
- mark_inode_dirty(new_dir);
- update_inode_page(new_dir);
+ f2fs_mark_inode_dirty_sync(new_dir);
f2fs_unlock_op(sbi);
@@ -1027,12 +1038,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
goto errout;
}
- /* this is broken symlink case */
- if (unlikely(cstr.name[0] == 0)) {
- res = -ENOENT;
- goto errout;
- }
-
if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
/* Symlink data on the disk is corrupted */
res = -EIO;
@@ -1046,17 +1051,23 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
if (res < 0)
goto errout;
+ /* this is broken symlink case */
+ if (unlikely(pstr.name[0] == 0)) {
+ res = -ENOENT;
+ goto errout;
+ }
+
paddr = pstr.name;
/* Null-terminate the name */
paddr[res] = '\0';
- page_cache_release(cpage);
+ put_page(cpage);
set_delayed_call(done, kfree_link, paddr);
return paddr;
errout:
fscrypt_fname_free_buffer(&pstr);
- page_cache_release(cpage);
+ put_page(cpage);
return ERR_PTR(res);
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 118321bd1a7fa..b2fa4b615925b 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -46,12 +46,16 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
*/
if (type == FREE_NIDS) {
mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+ if (excess_cached_nats(sbi))
+ res = false;
+ if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD)
+ res = false;
} else if (type == DIRTY_DENTS) {
if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
@@ -62,13 +66,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
for (i = 0; i <= UPDATE_INO; i++)
mem_size += (sbi->im[i].ino_num *
- sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
+ sizeof(struct ino_entry)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else if (type == EXTENT_CACHE) {
mem_size = (atomic_read(&sbi->total_ext_tree) *
sizeof(struct extent_tree) +
atomic_read(&sbi->total_ext_node) *
- sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
+ sizeof(struct extent_node)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else {
if (!sbi->sb->s_bdi->wb.dirty_exceeded)
@@ -121,7 +125,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
- memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ memcpy(dst_addr, src_addr, PAGE_SIZE);
set_page_dirty(dst_page);
f2fs_put_page(src_page, 1);
@@ -202,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return need;
}
@@ -219,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -233,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -280,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = grab_nat_entry(nm_i, ni->nid);
@@ -330,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
}
int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -338,8 +342,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
- if (!down_write_trylock(&nm_i->nat_tree_lock))
- return 0;
+ percpu_down_write(&nm_i->nat_tree_lock);
while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
struct nat_entry *ne;
@@ -348,7 +351,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
__del_from_nat_cache(nm_i, ne);
nr_shrink--;
}
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
@@ -370,13 +373,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->nid = nid;
/* Check nat cache */
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return;
}
@@ -400,11 +403,34 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
cache_nat_entry(sbi, nid, &ne);
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
+}
+
+/*
+ * readahead MAX_RA_NODE number of node pages.
+ */
+static void ra_node_pages(struct page *parent, int start, int n)
+{
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ struct blk_plug plug;
+ int i, end;
+ nid_t nid;
+
+ blk_start_plug(&plug);
+
+ /* Then, try readahead for siblings of the desired node */
+ end = start + n;
+ end = min(end, NIDS_PER_BLOCK);
+ for (i = start; i < end; i++) {
+ nid = get_nid(parent, i, false);
+ ra_node_page(sbi, nid);
+ }
+
+ blk_finish_plug(&plug);
}
pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
@@ -623,6 +649,7 @@ release_out:
if (err == -ENOENT) {
dn->cur_level = i;
dn->max_level = level;
+ dn->ofs_in_node = offset[level];
}
return err;
}
@@ -647,8 +674,7 @@ static void truncate_node(struct dnode_of_data *dn)
if (dn->nid == dn->inode->i_ino) {
remove_orphan_inode(sbi, dn->nid);
dec_valid_inode_count(sbi);
- } else {
- sync_inode_page(dn);
+ f2fs_inode_synced(dn->inode);
}
invalidate:
clear_node_page_dirty(dn->node_page);
@@ -707,6 +733,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
return PTR_ERR(page);
}
+ ra_node_pages(page, ofs, NIDS_PER_BLOCK);
+
rn = F2FS_NODE(page);
if (depth < 3) {
for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
@@ -784,6 +812,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
}
+ ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
+
/* free direct nodes linked to a partial indirect node */
for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
child_nid = get_nid(pages[idx], i, false);
@@ -832,7 +862,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
-restart:
+
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
@@ -896,10 +926,7 @@ skip_partial:
if (offset[1] == 0 &&
ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
lock_page(page);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
- goto restart;
- }
+ BUG_ON(page->mapping != NODE_MAPPING(sbi));
f2fs_wait_on_page_writeback(page, NODE, true);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
@@ -929,7 +956,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
if (IS_ERR(npage))
return PTR_ERR(npage);
- F2FS_I(inode)->i_xattr_nid = 0;
+ f2fs_i_xnid_write(inode, 0);
/* need to do checkpoint during fsync */
F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
@@ -995,10 +1022,10 @@ struct page *new_node_page(struct dnode_of_data *dn,
struct page *page;
int err;
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return ERR_PTR(-EPERM);
- page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -1018,21 +1045,16 @@ struct page *new_node_page(struct dnode_of_data *dn,
f2fs_wait_on_page_writeback(page, NODE, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (set_page_dirty(page))
dn->node_changed = true;
if (f2fs_has_xattr_block(ofs))
- F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
+ f2fs_i_xnid_write(dn->inode, dn->nid);
- dn->node_page = page;
- if (ipage)
- update_inode(dn->inode, ipage);
- else
- sync_inode_page(dn);
if (ofs == 0)
inc_valid_inode_count(sbi);
-
return page;
fail:
@@ -1046,18 +1068,22 @@ fail:
* 0: f2fs_put_page(page, 0)
* LOCKED_PAGE or error: f2fs_put_page(page, 1)
*/
-static int read_node_page(struct page *page, int rw)
+static int read_node_page(struct page *page, int op_flags)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
struct f2fs_io_info fio = {
.sbi = sbi,
.type = NODE,
- .rw = rw,
+ .op = REQ_OP_READ,
+ .op_flags = op_flags,
.page = page,
.encrypted_page = NULL,
};
+ if (PageUptodate(page))
+ return LOCKED_PAGE;
+
get_node_info(sbi, page->index, &ni);
if (unlikely(ni.blk_addr == NULL_ADDR)) {
@@ -1065,9 +1091,6 @@ static int read_node_page(struct page *page, int rw)
return -ENOENT;
}
- if (PageUptodate(page))
- return LOCKED_PAGE;
-
fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
return f2fs_submit_page_bio(&fio);
}
@@ -1090,37 +1113,14 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
if (apage)
return;
- apage = grab_cache_page(NODE_MAPPING(sbi), nid);
+ apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
if (!apage)
return;
- err = read_node_page(apage, READA);
+ err = read_node_page(apage, REQ_RAHEAD);
f2fs_put_page(apage, err ? 1 : 0);
}
-/*
- * readahead MAX_RA_NODE number of node pages.
- */
-static void ra_node_pages(struct page *parent, int start)
-{
- struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
- struct blk_plug plug;
- int i, end;
- nid_t nid;
-
- blk_start_plug(&plug);
-
- /* Then, try readahead for siblings of the desired node */
- end = start + MAX_RA_NODE;
- end = min(end, NIDS_PER_BLOCK);
- for (i = start; i < end; i++) {
- nid = get_nid(parent, i, false);
- ra_node_page(sbi, nid);
- }
-
- blk_finish_plug(&plug);
-}
-
static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
struct page *parent, int start)
{
@@ -1131,7 +1131,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
return ERR_PTR(-ENOENT);
f2fs_bug_on(sbi, check_nid_range(sbi, nid));
repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -1144,20 +1144,25 @@ repeat:
}
if (parent)
- ra_node_pages(parent, start + 1);
+ ra_node_pages(parent, start + 1, MAX_RA_NODE);
lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
+
+ if (unlikely(!PageUptodate(page)))
+ goto out_err;
page_hit:
- f2fs_bug_on(sbi, nid != nid_of_node(page));
+ if(unlikely(nid != nid_of_node(page))) {
+ f2fs_bug_on(sbi, 1);
+ ClearPageUptodate(page);
+out_err:
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
return page;
}
@@ -1174,41 +1179,21 @@ struct page *get_node_page_ra(struct page *parent, int start)
return __get_node_page(sbi, nid, parent, start);
}
-void sync_inode_page(struct dnode_of_data *dn)
-{
- int ret = 0;
-
- if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
- ret = update_inode(dn->inode, dn->node_page);
- } else if (dn->inode_page) {
- if (!dn->inode_page_locked)
- lock_page(dn->inode_page);
- ret = update_inode(dn->inode, dn->inode_page);
- if (!dn->inode_page_locked)
- unlock_page(dn->inode_page);
- } else {
- ret = update_inode_page(dn->inode);
- }
- dn->node_changed = ret ? true: false;
-}
-
static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode;
struct page *page;
+ int ret;
/* should flush inline_data before evict_inode */
inode = ilookup(sbi->sb, ino);
if (!inode)
return;
- page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0);
+ page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
if (!page)
goto iput_out;
- if (!trylock_page(page))
- goto release_out;
-
if (!PageUptodate(page))
goto page_out;
@@ -1218,24 +1203,219 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
if (!clear_page_dirty_for_io(page))
goto page_out;
- if (!f2fs_write_inline_data(inode, page))
- inode_dec_dirty_pages(inode);
- else
+ ret = f2fs_write_inline_data(inode, page);
+ inode_dec_dirty_pages(inode);
+ if (ret)
set_page_dirty(page);
page_out:
- unlock_page(page);
-release_out:
- f2fs_put_page(page, 0);
+ f2fs_put_page(page, 1);
iput_out:
iput(inode);
}
-int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
- struct writeback_control *wbc)
+void move_node_page(struct page *node_page, int gc_type)
+{
+ if (gc_type == FG_GC) {
+ struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 1,
+ .for_reclaim = 0,
+ };
+
+ set_page_dirty(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
+
+ f2fs_bug_on(sbi, PageWriteback(node_page));
+ if (!clear_page_dirty_for_io(node_page))
+ goto out_page;
+
+ if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
+ unlock_page(node_page);
+ goto release_page;
+ } else {
+ /* set page dirty and write it */
+ if (!PageWriteback(node_page))
+ set_page_dirty(node_page);
+ }
+out_page:
+ unlock_page(node_page);
+release_page:
+ f2fs_put_page(node_page, 0);
+}
+
+static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ pgoff_t index, end;
+ struct pagevec pvec;
+ struct page *last_page = NULL;
+
+ pagevec_init(&pvec, 0);
+ index = 0;
+ end = ULONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_put_page(last_page, 0);
+ pagevec_release(&pvec);
+ return ERR_PTR(-EIO);
+ }
+
+ if (!IS_DNODE(page) || !is_cold_node(page))
+ continue;
+ if (ino_of_node(page) != ino)
+ continue;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (last_page)
+ f2fs_put_page(last_page, 0);
+
+ get_page(page);
+ last_page = page;
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return last_page;
+}
+
+int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct writeback_control *wbc, bool atomic)
+{
+ pgoff_t index, end;
+ struct pagevec pvec;
+ int ret = 0;
+ struct page *last_page = NULL;
+ bool marked = false;
+ nid_t ino = inode->i_ino;
+
+ if (atomic) {
+ last_page = last_fsync_dnode(sbi, ino);
+ if (IS_ERR_OR_NULL(last_page))
+ return PTR_ERR_OR_ZERO(last_page);
+ }
+retry:
+ pagevec_init(&pvec, 0);
+ index = 0;
+ end = ULONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_put_page(last_page, 0);
+ pagevec_release(&pvec);
+ return -EIO;
+ }
+
+ if (!IS_DNODE(page) || !is_cold_node(page))
+ continue;
+ if (ino_of_node(page) != ino)
+ continue;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page) && page != last_page) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ f2fs_wait_on_page_writeback(page, NODE, true);
+ BUG_ON(PageWriteback(page));
+
+ if (!atomic || page == last_page) {
+ set_fsync_mark(page, 1);
+ if (IS_INODE(page)) {
+ if (is_inode_flag_set(inode,
+ FI_DIRTY_INODE))
+ update_inode(inode, page);
+ set_dentry_mark(page,
+ need_dentry_mark(sbi, ino));
+ }
+ /* may be written by other thread */
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ }
+
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
+ if (ret) {
+ unlock_page(page);
+ f2fs_put_page(last_page, 0);
+ break;
+ }
+ if (page == last_page) {
+ f2fs_put_page(page, 0);
+ marked = true;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+
+ if (ret || marked)
+ break;
+ }
+ if (!ret && atomic && !marked) {
+ f2fs_msg(sbi->sb, KERN_DEBUG,
+ "Retry to write fsync mark: ino=%u, idx=%lx",
+ ino, last_page->index);
+ lock_page(last_page);
+ set_page_dirty(last_page);
+ unlock_page(last_page);
+ goto retry;
+ }
+ return ret ? -EIO: 0;
+}
+
+int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
{
pgoff_t index, end;
struct pagevec pvec;
- int step = ino ? 2 : 0;
+ int step = 0;
int nwritten = 0;
pagevec_init(&pvec, 0);
@@ -1274,15 +1454,8 @@ next_step:
if (step == 2 && (!IS_DNODE(page) ||
!is_cold_node(page)))
continue;
-
- /*
- * If an fsync mode,
- * we should not skip writing node pages.
- */
lock_node:
- if (ino && ino_of_node(page) == ino)
- lock_page(page);
- else if (!trylock_page(page))
+ if (!trylock_page(page))
continue;
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
@@ -1290,8 +1463,6 @@ continue_unlock:
unlock_page(page);
continue;
}
- if (ino && ino_of_node(page) != ino)
- goto continue_unlock;
if (!PageDirty(page)) {
/* someone wrote it for us */
@@ -1299,7 +1470,7 @@ continue_unlock:
}
/* flush inline_data */
- if (!ino && is_inline_node(page)) {
+ if (is_inline_node(page)) {
clear_inline_node(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
@@ -1312,17 +1483,8 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- /* called by fsync() */
- if (ino && IS_DNODE(page)) {
- set_fsync_mark(page, 1);
- if (IS_INODE(page))
- set_dentry_mark(page,
- need_dentry_mark(sbi, ino));
- nwritten++;
- } else {
- set_fsync_mark(page, 0);
- set_dentry_mark(page, 0);
- }
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
unlock_page(page);
@@ -1397,7 +1559,8 @@ static int f2fs_write_node_page(struct page *page,
struct f2fs_io_info fio = {
.sbi = sbi,
.type = NODE,
- .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ .op = REQ_OP_WRITE,
+ .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
.page = page,
.encrypted_page = NULL,
};
@@ -1457,6 +1620,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ struct blk_plug plug;
long diff;
/* balancing f2fs's metadata in background */
@@ -1470,7 +1634,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
- sync_node_pages(sbi, 0, wbc);
+ blk_start_plug(&plug);
+ sync_node_pages(sbi, wbc);
+ blk_finish_plug(&plug);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return 0;
@@ -1484,9 +1650,10 @@ static int f2fs_set_node_page_dirty(struct page *page)
{
trace_f2fs_set_page_dirty(page, NODE);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
SetPagePrivate(page);
f2fs_trace_pid(page);
@@ -1524,7 +1691,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
struct nat_entry *ne;
- bool allocated = false;
if (!available_free_memory(sbi, FREE_NIDS))
return -1;
@@ -1538,8 +1704,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
ne = __lookup_nat_cache(nm_i, nid);
if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
- allocated = true;
- if (allocated)
return 0;
}
@@ -1608,7 +1772,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
}
}
-static void build_free_nids(struct f2fs_sb_info *sbi)
+void build_free_nids(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1617,14 +1781,14 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
nid_t nid = nm_i->next_scan_nid;
/* Enough entries */
- if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
+ if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK)
return;
/* readahead nat pages to be scanned */
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1656,7 +1820,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
remove_free_nid(nm_i, nid);
}
up_read(&curseg->journal_rwsem);
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -1672,6 +1836,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
retry:
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_ALLOC_NID))
+ return false;
+#endif
if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
return false;
@@ -1751,12 +1919,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
struct free_nid *i, *next;
int nr = nr_shrink;
+ if (nm_i->fcnt <= MAX_FREE_NIDS)
+ return 0;
+
if (!mutex_trylock(&nm_i->build_lock))
return 0;
spin_lock(&nm_i->free_nid_list_lock);
list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
- if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK)
+ if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS)
break;
if (i->state == NID_ALLOC)
continue;
@@ -1783,7 +1954,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
ri = F2FS_INODE(page);
if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
- clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+ clear_inode_flag(inode, FI_INLINE_XATTR);
goto update_inode;
}
@@ -1825,13 +1996,11 @@ recover_xnid:
get_node_info(sbi, new_xnid, &ni);
ni.ino = inode->i_ino;
set_node_addr(sbi, &ni, NEW_ADDR, false);
- F2FS_I(inode)->i_xattr_nid = new_xnid;
+ f2fs_i_xnid_write(inode, new_xnid);
/* 3: update xattr blkaddr */
refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
set_node_addr(sbi, &ni, blkaddr, false);
-
- update_inode_page(inode);
}
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1846,14 +2015,15 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (unlikely(old_ni.blk_addr != NULL_ADDR))
return -EINVAL;
- ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
+ ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage)
return -ENOMEM;
/* Should not use this inode from free nid list */
remove_free_nid(NM_I(sbi), ino);
- SetPageUptodate(ipage);
+ if (!PageUptodate(ipage))
+ SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
src = F2FS_INODE(page);
@@ -2039,7 +2209,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!nm_i->dirty_nat_cnt)
return;
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
/*
* if there are no enough space in journal to store dirty nat
@@ -2062,7 +2232,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
list_for_each_entry_safe(set, tmp, &sets, set_list)
__flush_nat_entry_set(sbi, set);
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
@@ -2098,7 +2268,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
- init_rwsem(&nm_i->nat_tree_lock);
+ if (percpu_init_rwsem(&nm_i->nat_tree_lock))
+ return -ENOMEM;
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2155,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->free_nid_list_lock);
/* destroy nat cache */
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -2180,8 +2351,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
+ percpu_free_rwsem(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
sbi->nm_info = NULL;
kfree(nm_i);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 1f4f9d4569d9c..fc7684554b1a9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -15,18 +15,21 @@
#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
/* # of pages to perform synchronous readahead before building free nids */
-#define FREE_NID_PAGES 4
+#define FREE_NID_PAGES 8
+#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
-#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */
+#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
/* control the memory footprint threshold (10MB per 1GB ram) */
-#define DEF_RAM_THRESHOLD 10
+#define DEF_RAM_THRESHOLD 1
/* control dirty nats ratio threshold (default: 10% over max nid count) */
#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10
+/* control total # of nats */
+#define DEF_NAT_CACHE_THRESHOLD 100000
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
@@ -126,6 +129,11 @@ static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
NM_I(sbi)->dirty_nats_ratio / 100;
}
+static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
+{
+ return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
+}
+
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 0b30cd2aeebd5..9e652d5a659bd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab;
bool space_for_roll_forward(struct f2fs_sb_info *sbi)
{
- if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
- > sbi->user_block_count)
+ s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
+
+ if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
return true;
}
@@ -67,7 +68,30 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
return NULL;
}
-static int recover_dentry(struct inode *inode, struct page *ipage)
+static struct fsync_inode_entry *add_fsync_inode(struct list_head *head,
+ struct inode *inode)
+{
+ struct fsync_inode_entry *entry;
+
+ entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
+ if (!entry)
+ return NULL;
+
+ entry->inode = inode;
+ list_add_tail(&entry->list, head);
+
+ return entry;
+}
+
+static void del_fsync_inode(struct fsync_inode_entry *entry)
+{
+ iput(entry->inode);
+ list_del(&entry->list);
+ kmem_cache_free(fsync_entry_slab, entry);
+}
+
+static int recover_dentry(struct inode *inode, struct page *ipage,
+ struct list_head *dir_list)
{
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -75,18 +99,29 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
struct qstr name;
struct page *page;
struct inode *dir, *einode;
+ struct fsync_inode_entry *entry;
int err = 0;
- dir = f2fs_iget(inode->i_sb, pino);
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
+ entry = get_fsync_inode(dir_list, pino);
+ if (!entry) {
+ dir = f2fs_iget(inode->i_sb, pino);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto out;
+ }
+
+ entry = add_fsync_inode(dir_list, dir);
+ if (!entry) {
+ err = -ENOMEM;
+ iput(dir);
+ goto out;
+ }
}
- if (file_enc_name(inode)) {
- iput(dir);
+ dir = entry->inode;
+
+ if (file_enc_name(inode))
return 0;
- }
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
@@ -94,7 +129,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
if (unlikely(name.len > F2FS_NAME_LEN)) {
WARN_ON(1);
err = -ENAMETOOLONG;
- goto out_err;
+ goto out;
}
retry:
de = f2fs_find_entry(dir, &name, &page);
@@ -118,25 +153,17 @@ retry:
f2fs_delete_entry(de, page, dir, einode);
iput(einode);
goto retry;
- }
- err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
- if (err)
- goto out_err;
-
- if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) {
- iput(dir);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
} else {
- add_dirty_dir_inode(dir);
- set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+ err = __f2fs_add_link(dir, &name, inode,
+ inode->i_ino, inode->i_mode);
}
-
goto out;
out_unmap_put:
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
-out_err:
- iput(dir);
out:
f2fs_msg(inode->i_sb, KERN_NOTICE,
"%s: ino = %x, name = %s, dir = %lx, err = %d",
@@ -151,7 +178,7 @@ static void recover_inode(struct inode *inode, struct page *page)
char *name;
inode->i_mode = le16_to_cpu(raw->i_mode);
- i_size_write(inode, le64_to_cpu(raw->i_size));
+ f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
@@ -198,6 +225,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
+ struct inode *inode;
struct page *page = NULL;
block_t blkaddr;
int err = 0;
@@ -206,8 +234,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- ra_meta_pages(sbi, blkaddr, 1, META_POR, true);
-
while (1) {
struct fsync_inode_entry *entry;
@@ -233,35 +259,32 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
break;
}
- /* add this fsync inode to the list */
- entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
- if (!entry) {
- err = -ENOMEM;
- break;
- }
/*
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
- if (IS_ERR(entry->inode)) {
- err = PTR_ERR(entry->inode);
- kmem_cache_free(fsync_entry_slab, entry);
+ inode = f2fs_iget(sbi->sb, ino_of_node(page));
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
if (err == -ENOENT) {
err = 0;
goto next;
}
break;
}
- list_add_tail(&entry->list, head);
+
+ /* add this fsync inode to the list */
+ entry = add_fsync_inode(head, inode);
+ if (!entry) {
+ err = -ENOMEM;
+ iput(inode);
+ break;
+ }
}
entry->blkaddr = blkaddr;
- if (IS_INODE(page)) {
- entry->last_inode = blkaddr;
- if (is_dent_dnode(page))
- entry->last_dentry = blkaddr;
- }
+ if (IS_INODE(page) && is_dent_dnode(page))
+ entry->last_dentry = blkaddr;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
@@ -277,11 +300,8 @@ static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
- list_for_each_entry_safe(entry, tmp, head, list) {
- iput(entry->inode);
- list_del(&entry->list);
- kmem_cache_free(fsync_entry_slab, entry);
- }
+ list_for_each_entry_safe(entry, tmp, head, list)
+ del_fsync_inode(entry);
}
static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
@@ -438,14 +458,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
continue;
}
+ if ((start + 1) << PAGE_SHIFT > i_size_read(inode))
+ f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT);
+
/*
* dest is reserved block, invalidate src block
* and then reserve one new block in dnode page.
*/
if (dest == NEW_ADDR) {
truncate_data_blocks_range(&dn, 1);
- err = reserve_new_block(&dn);
- f2fs_bug_on(sbi, err);
+ reserve_new_block(&dn);
continue;
}
@@ -454,8 +476,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ while (err)
+ err = reserve_new_block(&dn);
+#endif
/* We should not get -ENOSPC */
f2fs_bug_on(sbi, err);
+ if (err)
+ goto err;
}
/* Check the previous node page having this index */
@@ -470,9 +498,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
}
- if (IS_INODE(dn.node_page))
- sync_inode_page(&dn);
-
copy_node_footer(dn.node_page, page);
fill_node_footer(dn.node_page, dn.nid, ni.ino,
ofs_of_node(page), false);
@@ -486,7 +511,8 @@ out:
return err;
}
-static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
+static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
+ struct list_head *dir_list)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
@@ -513,7 +539,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
break;
}
- entry = get_fsync_inode(head, ino_of_node(page));
+ entry = get_fsync_inode(inode_list, ino_of_node(page));
if (!entry)
goto next;
/*
@@ -521,10 +547,10 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
* In this case, we can lose the latest inode(x).
* So, call recover_inode for the inode update.
*/
- if (entry->last_inode == blkaddr)
+ if (IS_INODE(page))
recover_inode(entry->inode, page);
if (entry->last_dentry == blkaddr) {
- err = recover_dentry(entry->inode, page);
+ err = recover_dentry(entry->inode, page, dir_list);
if (err) {
f2fs_put_page(page, 1);
break;
@@ -536,11 +562,8 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
break;
}
- if (entry->blkaddr == blkaddr) {
- iput(entry->inode);
- list_del(&entry->list);
- kmem_cache_free(fsync_entry_slab, entry);
- }
+ if (entry->blkaddr == blkaddr)
+ del_fsync_inode(entry);
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
@@ -551,12 +574,14 @@ next:
return err;
}
-int recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
struct list_head inode_list;
+ struct list_head dir_list;
block_t blkaddr;
int err;
+ int ret = 0;
bool need_writecp = false;
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -565,6 +590,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
return -ENOMEM;
INIT_LIST_HEAD(&inode_list);
+ INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
@@ -573,25 +599,26 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list);
- if (err)
+ if (err || list_empty(&inode_list))
goto out;
- if (list_empty(&inode_list))
+ if (check_only) {
+ ret = 1;
goto out;
+ }
need_writecp = true;
/* step #2: recover data */
- err = recover_data(sbi, &inode_list);
+ err = recover_data(sbi, &inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
- kmem_cache_destroy(fsync_entry_slab);
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
- (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+ (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
@@ -602,8 +629,12 @@ out:
if (err) {
bool invalidate = false;
- if (discard_next_dnode(sbi, blkaddr))
+ if (test_opt(sbi, LFS)) {
+ update_meta_page(sbi, NULL, blkaddr);
+ invalidate = true;
+ } else if (discard_next_dnode(sbi, blkaddr)) {
invalidate = true;
+ }
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
@@ -625,5 +656,8 @@ out:
} else {
mutex_unlock(&sbi->cp_mutex);
}
- return err;
+
+ destroy_fsync_dnodes(&dir_list);
+ kmem_cache_destroy(fsync_entry_slab);
+ return ret ? ret: err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 6f16b39f0b528..a46296f57b026 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -223,9 +223,11 @@ static int __revoke_inmem_pages(struct inode *inode,
f2fs_put_dnode(&dn);
}
next:
- ClearPageUptodate(page);
+ /* we don't need to invalidate this in the successful status */
+ if (drop || recover)
+ ClearPageUptodate(page);
set_page_private(page, 0);
- ClearPageUptodate(page);
+ ClearPagePrivate(page);
f2fs_put_page(page, 1);
list_del(&cur->list);
@@ -239,6 +241,8 @@ void drop_inmem_pages(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+
mutex_lock(&fi->inmem_lock);
__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
mutex_unlock(&fi->inmem_lock);
@@ -253,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode,
struct f2fs_io_info fio = {
.sbi = sbi,
.type = DATA,
- .rw = WRITE_SYNC | REQ_PRIO,
+ .op = REQ_OP_WRITE,
+ .op_flags = WRITE_SYNC | REQ_PRIO,
.encrypted_page = NULL,
};
bool submit_bio = false;
@@ -341,6 +346,11 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
if (!need)
return;
+
+ /* balance_fs_bg is able to be pending */
+ if (excess_cached_nats(sbi))
+ f2fs_balance_fs_bg(sbi);
+
/*
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
@@ -362,7 +372,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
if (!available_free_memory(sbi, FREE_NIDS))
- try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES);
+ try_to_free_nids(sbi, MAX_FREE_NIDS);
+ else
+ build_free_nids(sbi);
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
@@ -402,7 +414,8 @@ repeat:
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
bio->bi_bdev = sbi->sb->s_bdev;
- ret = submit_bio_wait(WRITE_FLUSH, bio);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+ ret = submit_bio_wait(bio);
llist_for_each_entry_safe(cmd, next,
fcc->dispatch_list, llnode) {
@@ -429,24 +442,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
if (test_opt(sbi, NOBARRIER))
return 0;
- if (!test_opt(sbi, FLUSH_MERGE)) {
+ if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
struct bio *bio = f2fs_bio_alloc(0);
int ret;
+ atomic_inc(&fcc->submit_flush);
bio->bi_bdev = sbi->sb->s_bdev;
- ret = submit_bio_wait(WRITE_FLUSH, bio);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+ ret = submit_bio_wait(bio);
+ atomic_dec(&fcc->submit_flush);
bio_put(bio);
return ret;
}
init_completion(&cmd.wait);
+ atomic_inc(&fcc->submit_flush);
llist_add(&cmd.llnode, &fcc->issue_list);
if (!fcc->dispatch_list)
wake_up(&fcc->flush_wait_queue);
wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->submit_flush);
return cmd.ret;
}
@@ -460,6 +478,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
+ atomic_set(&fcc->submit_flush, 0);
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
SM_I(sbi)->cmd_control_info = fcc;
@@ -661,6 +680,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
+ if (force && start && end != max_blocks
+ && (end - start) < cpc->trim_minlen)
+ continue;
+
__add_discard_entry(sbi, cpc, se, start, end);
}
}
@@ -698,6 +721,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int start = 0, end = -1;
+ unsigned int secno, start_segno;
+ bool force = (cpc->reason == CP_DISCARD);
mutex_lock(&dirty_i->seglist_lock);
@@ -714,17 +739,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
dirty_i->nr_dirty[PRE] -= end - start;
- if (!test_opt(sbi, DISCARD))
+ if (force || !test_opt(sbi, DISCARD))
continue;
- f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
+ if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) {
+ f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
+ continue;
+ }
+next:
+ secno = GET_SECNO(sbi, start);
+ start_segno = secno * sbi->segs_per_sec;
+ if (!IS_CURSEC(sbi, secno) &&
+ !get_valid_blocks(sbi, start, sbi->segs_per_sec))
+ f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
+ sbi->segs_per_sec << sbi->log_blocks_per_seg);
+
+ start = start_segno + sbi->segs_per_sec;
+ if (start < end)
+ goto next;
}
mutex_unlock(&dirty_i->seglist_lock);
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
- if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen)
+ if (force && entry->len < cpc->trim_minlen)
goto skip;
f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
cpc->trimmed += entry->len;
@@ -885,12 +924,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
}
}
- sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
+ sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
SUM_FOOTER_SIZE) / SUMMARY_SIZE;
if (valid_sum_count <= sum_in_page)
return 1;
else if ((valid_sum_count - sum_in_page) <=
- (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
+ (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
return 2;
return 3;
}
@@ -909,9 +948,9 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
void *dst = page_address(page);
if (src)
- memcpy(dst, src, PAGE_CACHE_SIZE);
+ memcpy(dst, src, PAGE_SIZE);
else
- memset(dst, 0, PAGE_CACHE_SIZE);
+ memset(dst, 0, PAGE_SIZE);
set_page_dirty(page);
f2fs_put_page(page, 1);
}
@@ -1212,6 +1251,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
+ if (test_opt(sbi, LFS))
+ return;
+
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
__allocate_new_segments(sbi, i);
}
@@ -1385,11 +1427,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio->page, fio->type);
+ if (fio->type == NODE || fio->type == DATA)
+ mutex_lock(&fio->sbi->wio_mutex[fio->type]);
+
allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type);
/* writeout dirty page into bdev */
f2fs_submit_page_mbio(fio);
+
+ if (fio->type == NODE || fio->type == DATA)
+ mutex_unlock(&fio->sbi->wio_mutex[fio->type]);
}
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1397,7 +1445,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
- .rw = WRITE_SYNC | REQ_META | REQ_PRIO,
+ .op = REQ_OP_WRITE,
+ .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = page->index,
.new_blkaddr = page->index,
.page = page,
@@ -1405,7 +1454,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
};
if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
- fio.rw &= ~REQ_META;
+ fio.op_flags &= ~REQ_META;
set_page_writeback(page);
f2fs_submit_page_mbio(&fio);
@@ -1596,7 +1645,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
s = (struct f2fs_summary *)(kaddr + offset);
seg_i->sum_blk->entries[j] = *s;
offset += SUMMARY_SIZE;
- if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ if (offset + SUMMARY_SIZE <= PAGE_SIZE -
SUM_FOOTER_SIZE)
continue;
@@ -1757,7 +1806,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
*summary = seg_i->sum_blk->entries[j];
written_size += SUMMARY_SIZE;
- if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ if (written_size + SUMMARY_SIZE <= PAGE_SIZE -
SUM_FOOTER_SIZE)
continue;
@@ -1844,7 +1893,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
- memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ memcpy(dst_addr, src_addr, PAGE_SIZE);
set_page_dirty(dst_page);
f2fs_put_page(src_page, 1);
@@ -2171,7 +2220,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_CURSEG_TYPE; i++) {
mutex_init(&array[i].curseg_mutex);
- array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
init_rwsem(&array[i].journal_rwsem);
@@ -2369,7 +2418,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
sm_info->rec_prefree_segments = sm_info->main_segments *
DEF_RECLAIM_PREFREE_SEGMENTS / 100;
- sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
+ if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
+ sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
+
+ if (!test_opt(sbi, LFS))
+ sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 975c33df65c7c..b33f73ec60a48 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -16,6 +16,7 @@
#define NULL_SECNO ((unsigned int)(~0))
#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
+#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB at maximum */
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
@@ -158,16 +159,17 @@ struct victim_sel_policy {
};
struct seg_entry {
- unsigned short valid_blocks; /* # of valid blocks */
+ unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */
+ unsigned int valid_blocks:10; /* # of valid blocks */
+ unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */
+ unsigned int padding:6; /* padding */
unsigned char *cur_valid_map; /* validity bitmap of blocks */
/*
* # of valid blocks and the validity bitmap stored in the the last
* checkpoint pack. This information is used by the SSR mode.
*/
- unsigned short ckpt_valid_blocks;
- unsigned char *ckpt_valid_map;
+ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */
unsigned char *discard_map;
- unsigned char type; /* segment type like CURSEG_XXX_TYPE */
unsigned long long mtime; /* modification time of the segment */
};
@@ -469,6 +471,10 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+
+ if (test_opt(sbi, LFS))
+ return false;
+
return free_sections(sbi) <= (node_secs + 2 * dent_secs +
reserved_sections(sbi) + 1);
}
@@ -478,6 +484,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
@@ -530,6 +538,9 @@ static inline bool need_inplace_update(struct inode *inode)
if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
return false;
+ if (test_opt(sbi, LFS))
+ return false;
+
if (policy & (0x1 << F2FS_IPU_FORCE))
return true;
if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
@@ -543,7 +554,7 @@ static inline bool need_inplace_update(struct inode *inode)
/* this is only set during fdatasync */
if (policy & (0x1 << F2FS_IPU_FSYNC) &&
- is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
+ is_inode_flag_set(inode, FI_NEED_IPU))
return true;
return false;
@@ -705,9 +716,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
if (type == DATA)
return sbi->blocks_per_seg;
else if (type == NODE)
- return 3 * sbi->blocks_per_seg;
+ return 8 * sbi->blocks_per_seg;
else if (type == META)
- return MAX_BIO_BLOCKS(sbi);
+ return 8 * MAX_BIO_BLOCKS(sbi);
else
return 0;
}
@@ -725,10 +736,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
nr_to_write = wbc->nr_to_write;
- if (type == DATA)
- desired = 4096;
- else if (type == NODE)
- desired = 3 * max_hw_blocks(sbi);
+ if (type == NODE)
+ desired = 2 * max_hw_blocks(sbi);
else
desired = MAX_BIO_BLOCKS(sbi);
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 93606f281bf9c..46c9154259239 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -13,6 +13,7 @@
#include <linux/f2fs_fs.h>
#include "f2fs.h"
+#include "node.h"
static LIST_HEAD(f2fs_list);
static DEFINE_SPINLOCK(f2fs_list_lock);
@@ -25,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
{
- if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK)
- return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK;
+ if (NM_I(sbi)->fcnt > MAX_FREE_NIDS)
+ return NM_I(sbi)->fcnt - MAX_FREE_NIDS;
return 0;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 15bb81f8dac25..1b86d3f638efc 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -39,6 +39,31 @@ static struct proc_dir_entry *f2fs_proc_root;
static struct kmem_cache *f2fs_inode_cachep;
static struct kset *f2fs_kset;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+struct f2fs_fault_info f2fs_fault;
+
+char *fault_name[FAULT_MAX] = {
+ [FAULT_KMALLOC] = "kmalloc",
+ [FAULT_PAGE_ALLOC] = "page alloc",
+ [FAULT_ALLOC_NID] = "alloc nid",
+ [FAULT_ORPHAN] = "orphan",
+ [FAULT_BLOCK] = "no more block",
+ [FAULT_DIR_DEPTH] = "too big dir depth",
+ [FAULT_EVICT_INODE] = "evict_inode fail",
+};
+
+static void f2fs_build_fault_attr(unsigned int rate)
+{
+ if (rate) {
+ atomic_set(&f2fs_fault.inject_ops, 0);
+ f2fs_fault.inject_rate = rate;
+ f2fs_fault.inject_type = (1 << FAULT_MAX) - 1;
+ } else {
+ memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info));
+ }
+}
+#endif
+
/* f2fs-wide shrinker description */
static struct shrinker f2fs_shrinker_info = {
.scan_objects = f2fs_shrink_scan,
@@ -51,6 +76,7 @@ enum {
Opt_disable_roll_forward,
Opt_norecovery,
Opt_discard,
+ Opt_nodiscard,
Opt_noheap,
Opt_user_xattr,
Opt_nouser_xattr,
@@ -62,12 +88,17 @@ enum {
Opt_inline_data,
Opt_inline_dentry,
Opt_flush_merge,
+ Opt_noflush_merge,
Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
Opt_noextent_cache,
Opt_noinline_data,
Opt_data_flush,
+ Opt_mode,
+ Opt_fault_injection,
+ Opt_lazytime,
+ Opt_nolazytime,
Opt_err,
};
@@ -76,6 +107,7 @@ static match_table_t f2fs_tokens = {
{Opt_disable_roll_forward, "disable_roll_forward"},
{Opt_norecovery, "norecovery"},
{Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_noheap, "no_heap"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
@@ -87,12 +119,17 @@ static match_table_t f2fs_tokens = {
{Opt_inline_data, "inline_data"},
{Opt_inline_dentry, "inline_dentry"},
{Opt_flush_merge, "flush_merge"},
+ {Opt_noflush_merge, "noflush_merge"},
{Opt_nobarrier, "nobarrier"},
{Opt_fastboot, "fastboot"},
{Opt_extent_cache, "extent_cache"},
{Opt_noextent_cache, "noextent_cache"},
{Opt_noinline_data, "noinline_data"},
{Opt_data_flush, "data_flush"},
+ {Opt_mode, "mode=%s"},
+ {Opt_fault_injection, "fault_injection=%u"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_err, NULL},
};
@@ -102,6 +139,10 @@ enum {
SM_INFO, /* struct f2fs_sm_info */
NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ FAULT_INFO_RATE, /* struct f2fs_fault_info */
+ FAULT_INFO_TYPE, /* struct f2fs_fault_info */
+#endif
};
struct f2fs_attr {
@@ -123,6 +164,11 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
return (unsigned char *)NM_I(sbi);
else if (struct_type == F2FS_SBI)
return (unsigned char *)sbi;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ else if (struct_type == FAULT_INFO_RATE ||
+ struct_type == FAULT_INFO_TYPE)
+ return (unsigned char *)&f2fs_fault;
+#endif
return NULL;
}
@@ -172,6 +218,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret < 0)
return ret;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
+ return -EINVAL;
+#endif
*ui = t;
return count;
}
@@ -237,6 +287,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
+F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
+#endif
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
@@ -273,6 +327,22 @@ static struct kobj_type f2fs_ktype = {
.release = f2fs_sb_release,
};
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+/* sysfs for f2fs fault injection */
+static struct kobject f2fs_fault_inject;
+
+static struct attribute *f2fs_fault_attrs[] = {
+ ATTR_LIST(inject_rate),
+ ATTR_LIST(inject_type),
+ NULL
+};
+
+static struct kobj_type f2fs_fault_ktype = {
+ .default_attrs = f2fs_fault_attrs,
+ .sysfs_ops = &f2fs_attr_ops,
+};
+#endif
+
void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
struct va_format vaf;
@@ -300,6 +370,10 @@ static int parse_options(struct super_block *sb, char *options)
char *p, *name;
int arg = 0;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_build_fault_attr(0);
+#endif
+
if (!options)
return 0;
@@ -354,6 +428,9 @@ static int parse_options(struct super_block *sb, char *options)
"the device does not support discard");
}
break;
+ case Opt_nodiscard:
+ clear_opt(sbi, DISCARD);
+ break;
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
@@ -415,6 +491,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_flush_merge:
set_opt(sbi, FLUSH_MERGE);
break;
+ case Opt_noflush_merge:
+ clear_opt(sbi, FLUSH_MERGE);
+ break;
case Opt_nobarrier:
set_opt(sbi, NOBARRIER);
break;
@@ -433,6 +512,39 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_data_flush:
set_opt(sbi, DATA_FLUSH);
break;
+ case Opt_mode:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 8 &&
+ !strncmp(name, "adaptive", 8)) {
+ set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ } else if (strlen(name) == 3 &&
+ !strncmp(name, "lfs", 3)) {
+ set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_fault_injection:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_build_fault_attr(arg);
+#else
+ f2fs_msg(sb, KERN_INFO,
+ "FAULT_INJECTION was not selected");
+#endif
+ break;
+ case Opt_lazytime:
+ sb->s_flags |= MS_LAZYTIME;
+ break;
+ case Opt_nolazytime:
+ sb->s_flags &= ~MS_LAZYTIME;
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -453,20 +565,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_once((void *) fi);
+ if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) {
+ kmem_cache_free(f2fs_inode_cachep, fi);
+ return NULL;
+ }
+
/* Initialize f2fs-specific inode info */
fi->vfs_inode.i_version = 1;
- atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
init_rwsem(&fi->i_sem);
INIT_LIST_HEAD(&fi->dirty_list);
+ INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
-
- set_inode_flag(fi, FI_NEW_INODE);
-
- if (test_opt(F2FS_SB(sb), INLINE_XATTR))
- set_inode_flag(fi, FI_INLINE_XATTR);
+ init_rwsem(&fi->dio_rwsem[READ]);
+ init_rwsem(&fi->dio_rwsem[WRITE]);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
@@ -482,7 +596,7 @@ static int f2fs_drop_inode(struct inode *inode)
* - f2fs_gc -> iput -> evict
* - inode_wait_for_writeback(inode)
*/
- if (!inode_unhashed(inode) && inode->i_state & I_SYNC) {
+ if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
/* to avoid evict_inode call simultaneously */
atomic_inc(&inode->i_count);
@@ -496,10 +610,10 @@ static int f2fs_drop_inode(struct inode *inode)
f2fs_destroy_extent_node(inode);
sb_start_intwrite(inode->i_sb);
- i_size_write(inode, 0);
+ f2fs_i_size_write(inode, 0);
if (F2FS_HAS_BLOCKS(inode))
- f2fs_truncate(inode, true);
+ f2fs_truncate(inode);
sb_end_intwrite(inode->i_sb);
@@ -509,9 +623,47 @@ static int f2fs_drop_inode(struct inode *inode)
}
return 0;
}
+
return generic_drop_inode(inode);
}
+int f2fs_inode_dirtied(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return 1;
+ }
+
+ set_inode_flag(inode, FI_DIRTY_INODE);
+ list_add_tail(&F2FS_I(inode)->gdirty_list,
+ &sbi->inode_list[DIRTY_META]);
+ inc_page_count(sbi, F2FS_DIRTY_IMETA);
+ stat_inc_dirty_inode(sbi, DIRTY_META);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+
+ return 0;
+}
+
+void f2fs_inode_synced(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return;
+ }
+ list_del_init(&F2FS_I(inode)->gdirty_list);
+ clear_inode_flag(inode, FI_DIRTY_INODE);
+ clear_inode_flag(inode, FI_AUTO_RECOVER);
+ dec_page_count(sbi, F2FS_DIRTY_IMETA);
+ stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+}
+
/*
* f2fs_dirty_inode() is called from __mark_inode_dirty()
*
@@ -519,7 +671,19 @@ static int f2fs_drop_inode(struct inode *inode)
*/
static void f2fs_dirty_inode(struct inode *inode, int flags)
{
- set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ return;
+
+ if (flags == I_DIRTY_TIME)
+ return;
+
+ if (is_inode_flag_set(inode, FI_AUTO_RECOVER))
+ clear_inode_flag(inode, FI_AUTO_RECOVER);
+
+ f2fs_inode_dirtied(inode);
}
static void f2fs_i_callback(struct rcu_head *head)
@@ -530,15 +694,29 @@ static void f2fs_i_callback(struct rcu_head *head)
static void f2fs_destroy_inode(struct inode *inode)
{
+ percpu_counter_destroy(&F2FS_I(inode)->dirty_pages);
call_rcu(&inode->i_rcu, f2fs_i_callback);
}
+static void destroy_percpu_info(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ for (i = 0; i < NR_COUNT_TYPE; i++)
+ percpu_counter_destroy(&sbi->nr_pages[i]);
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
+ percpu_counter_destroy(&sbi->total_valid_inode_count);
+
+ percpu_free_rwsem(&sbi->cp_rwsem);
+}
+
static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
kobject_del(&sbi->s_kobj);
@@ -568,15 +746,14 @@ static void f2fs_put_super(struct super_block *sb)
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
*/
- release_ino_entry(sbi);
+ release_ino_entry(sbi, true);
release_discard_addrs(sbi);
f2fs_leave_shrinker(sbi);
mutex_unlock(&sbi->umount_mutex);
/* our cp_error case, we can wait for any writeback page */
- if (get_pages(sbi, F2FS_WRITEBACK))
- f2fs_flush_merged_bios(sbi);
+ f2fs_flush_merged_bios(sbi);
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -593,6 +770,8 @@ static void f2fs_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->raw_super);
+
+ destroy_percpu_info(sbi);
kfree(sbi);
}
@@ -648,7 +827,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = sbi->blocksize;
buf->f_blocks = total_count - start_count;
- buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
+ buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
@@ -713,6 +892,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noextent_cache");
if (test_opt(sbi, DATA_FLUSH))
seq_puts(seq, ",data_flush");
+
+ seq_puts(seq, ",mode=");
+ if (test_opt(sbi, ADAPTIVE))
+ seq_puts(seq, "adaptive");
+ else if (test_opt(sbi, LFS))
+ seq_puts(seq, "lfs");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -745,19 +930,46 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
return 0;
}
-static int segment_info_open_fs(struct inode *inode, struct file *file)
+static int segment_bits_seq_show(struct seq_file *seq, void *offset)
{
- return single_open(file, segment_info_seq_show, PDE_DATA(inode));
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned int total_segs =
+ le32_to_cpu(sbi->raw_super->segment_count_main);
+ int i, j;
+
+ seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n"
+ "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
+ for (i = 0; i < total_segs; i++) {
+ struct seg_entry *se = get_seg_entry(sbi, i);
+
+ seq_printf(seq, "%-10d", i);
+ seq_printf(seq, "%d|%-3u|", se->type,
+ get_valid_blocks(sbi, i, 1));
+ for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
+ seq_printf(seq, "%x ", se->cur_valid_map[j]);
+ seq_putc(seq, '\n');
+ }
+ return 0;
}
-static const struct file_operations f2fs_seq_segment_info_fops = {
- .owner = THIS_MODULE,
- .open = segment_info_open_fs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+#define F2FS_PROC_FILE_DEF(_name) \
+static int _name##_open_fs(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, _name##_seq_show, PDE_DATA(inode)); \
+} \
+ \
+static const struct file_operations f2fs_seq_##_name##_fops = { \
+ .open = _name##_open_fs, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
};
+F2FS_PROC_FILE_DEF(segment_info);
+F2FS_PROC_FILE_DEF(segment_bits);
+
static void default_options(struct f2fs_sb_info *sbi)
{
/* init some FS parameters */
@@ -766,6 +978,14 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, EXTENT_CACHE);
+ sbi->sb->s_flags |= MS_LAZYTIME;
+ set_opt(sbi, FLUSH_MERGE);
+ if (f2fs_sb_mounted_hmsmr(sbi->sb)) {
+ set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ set_opt(sbi, DISCARD);
+ } else {
+ set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ }
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -791,13 +1011,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
active_logs = sbi->active_logs;
- if (*flags & MS_RDONLY) {
- set_opt(sbi, FASTBOOT);
- set_sbi_flag(sbi, SBI_IS_DIRTY);
+ /* recover superblocks we couldn't write due to previous RO mount */
+ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
+ err = f2fs_commit_super(sbi, false);
+ f2fs_msg(sb, KERN_INFO,
+ "Try to recover all the superblocks, ret: %d", err);
+ if (!err)
+ clear_sbi_flag(sbi, SBI_NEED_SB_WRITE);
}
- sync_filesystem(sb);
-
sbi->mount_opt.opt = 0;
default_options(sbi);
@@ -829,7 +1051,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
if (sbi->gc_thread) {
stop_gc_thread(sbi);
- f2fs_sync_fs(sb, 1);
need_restart_gc = true;
}
} else if (!sbi->gc_thread) {
@@ -839,6 +1060,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
+ if (*flags & MS_RDONLY) {
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ sync_inodes_sb(sb);
+
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ set_sbi_flag(sbi, SBI_IS_CLOSE);
+ f2fs_sync_fs(sb, 1);
+ clear_sbi_flag(sbi, SBI_IS_CLOSE);
+ }
+
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
@@ -852,8 +1083,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
skip:
/* Update the POSIXACL Flag */
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+
return 0;
restore_gc:
if (need_restart_gc) {
@@ -893,6 +1125,12 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
ctx, len, NULL);
}
+static int f2fs_key_prefix(struct inode *inode, u8 **key)
+{
+ *key = F2FS_I_SB(inode)->key_prefix;
+ return F2FS_I_SB(inode)->key_prefix_size;
+}
+
static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
@@ -909,6 +1147,7 @@ static unsigned f2fs_max_namelen(struct inode *inode)
static struct fscrypt_operations f2fs_cryptops = {
.get_context = f2fs_get_context,
+ .key_prefix = f2fs_key_prefix,
.set_context = f2fs_set_context,
.is_encrypted = f2fs_encrypted_inode,
.empty_dir = f2fs_empty_dir,
@@ -984,9 +1223,26 @@ static loff_t max_file_blocks(void)
return result;
}
-static inline bool sanity_check_area_boundary(struct super_block *sb,
- struct f2fs_super_block *raw_super)
+static int __f2fs_commit_super(struct buffer_head *bh,
+ struct f2fs_super_block *super)
{
+ lock_buffer(bh);
+ if (super)
+ memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ /* it's rare case, we can do fua all the time */
+ return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+}
+
+static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
+ struct buffer_head *bh)
+{
+ struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
+ (bh->b_data + F2FS_SUPER_OFFSET);
+ struct super_block *sb = sbi->sb;
u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr);
@@ -1000,6 +1256,10 @@ static inline bool sanity_check_area_boundary(struct super_block *sb,
u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main);
u32 segment_count = le32_to_cpu(raw_super->segment_count);
u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+ u64 main_end_blkaddr = main_blkaddr +
+ (segment_count_main << log_blocks_per_seg);
+ u64 seg_end_blkaddr = segment0_blkaddr +
+ (segment_count << log_blocks_per_seg);
if (segment0_blkaddr != cp_blkaddr) {
f2fs_msg(sb, KERN_INFO,
@@ -1044,22 +1304,47 @@ static inline bool sanity_check_area_boundary(struct super_block *sb,
return true;
}
- if (main_blkaddr + (segment_count_main << log_blocks_per_seg) !=
- segment0_blkaddr + (segment_count << log_blocks_per_seg)) {
+ if (main_end_blkaddr > seg_end_blkaddr) {
f2fs_msg(sb, KERN_INFO,
- "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)",
+ "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)",
main_blkaddr,
- segment0_blkaddr + (segment_count << log_blocks_per_seg),
+ segment0_blkaddr +
+ (segment_count << log_blocks_per_seg),
segment_count_main << log_blocks_per_seg);
return true;
+ } else if (main_end_blkaddr < seg_end_blkaddr) {
+ int err = 0;
+ char *res;
+
+ /* fix in-memory information all the time */
+ raw_super->segment_count = cpu_to_le32((main_end_blkaddr -
+ segment0_blkaddr) >> log_blocks_per_seg);
+
+ if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) {
+ set_sbi_flag(sbi, SBI_NEED_SB_WRITE);
+ res = "internally";
+ } else {
+ err = __f2fs_commit_super(bh, NULL);
+ res = err ? "failed" : "done";
+ }
+ f2fs_msg(sb, KERN_INFO,
+ "Fix alignment : %s, start(%u) end(%u) block(%u)",
+ res, main_blkaddr,
+ segment0_blkaddr +
+ (segment_count << log_blocks_per_seg),
+ segment_count_main << log_blocks_per_seg);
+ if (err)
+ return true;
}
-
return false;
}
-static int sanity_check_raw_super(struct super_block *sb,
- struct f2fs_super_block *raw_super)
+static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
+ struct buffer_head *bh)
{
+ struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
+ (bh->b_data + F2FS_SUPER_OFFSET);
+ struct super_block *sb = sbi->sb;
unsigned int blocksize;
if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
@@ -1070,10 +1355,10 @@ static int sanity_check_raw_super(struct super_block *sb,
}
/* Currently, support only 4KB page cache size */
- if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+ if (F2FS_BLKSIZE != PAGE_SIZE) {
f2fs_msg(sb, KERN_INFO,
"Invalid page_cache_size (%lu), supports only 4KB\n",
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
return 1;
}
@@ -1126,7 +1411,7 @@ static int sanity_check_raw_super(struct super_block *sb,
}
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
- if (sanity_check_area_boundary(sb, raw_super))
+ if (sanity_check_area_boundary(sbi, bh))
return 1;
return 0;
@@ -1158,7 +1443,6 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
static void init_sb_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = sbi->raw_super;
- int i;
sbi->log_sectors_per_block =
le32_to_cpu(raw_super->log_sectors_per_block);
@@ -1178,9 +1462,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->cur_victim_sec = NULL_SECNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
- for (i = 0; i < NR_COUNT_TYPE; i++)
- atomic_set(&sbi->nr_pages[i], 0);
-
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
@@ -1188,6 +1469,35 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
+ mutex_init(&sbi->wio_mutex[NODE]);
+ mutex_init(&sbi->wio_mutex[DATA]);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
+ F2FS_KEY_DESC_PREFIX_SIZE);
+ sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE;
+#endif
+}
+
+static int init_percpu_info(struct f2fs_sb_info *sbi)
+{
+ int i, err;
+
+ if (percpu_init_rwsem(&sbi->cp_rwsem))
+ return -ENOMEM;
+
+ for (i = 0; i < NR_COUNT_TYPE; i++) {
+ err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
+ if (err)
+ return err;
+ }
+
+ err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL);
+ if (err)
+ return err;
+
+ return percpu_counter_init(&sbi->total_valid_inode_count, 0,
+ GFP_KERNEL);
}
/*
@@ -1196,13 +1506,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
* to get the first valid one. If any one of them is broken, we pass
* them recovery flag back to the caller.
*/
-static int read_raw_super_block(struct super_block *sb,
+static int read_raw_super_block(struct f2fs_sb_info *sbi,
struct f2fs_super_block **raw_super,
int *valid_super_block, int *recovery)
{
+ struct super_block *sb = sbi->sb;
int block;
struct buffer_head *bh;
- struct f2fs_super_block *super, *buf;
+ struct f2fs_super_block *super;
int err = 0;
super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
@@ -1218,11 +1529,8 @@ static int read_raw_super_block(struct super_block *sb,
continue;
}
- buf = (struct f2fs_super_block *)
- (bh->b_data + F2FS_SUPER_OFFSET);
-
/* sanity checking of raw super */
- if (sanity_check_raw_super(sb, buf)) {
+ if (sanity_check_raw_super(sbi, bh)) {
f2fs_msg(sb, KERN_ERR,
"Can't find valid F2FS filesystem in %dth superblock",
block + 1);
@@ -1232,7 +1540,8 @@ static int read_raw_super_block(struct super_block *sb,
}
if (!*raw_super) {
- memcpy(super, buf, sizeof(*super));
+ memcpy(super, bh->b_data + F2FS_SUPER_OFFSET,
+ sizeof(*super));
*valid_super_block = block;
*raw_super = super;
}
@@ -1252,42 +1561,35 @@ static int read_raw_super_block(struct super_block *sb,
return err;
}
-static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block)
+int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
{
- struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi);
struct buffer_head *bh;
int err;
- bh = sb_getblk(sbi->sb, block);
+ if ((recover && f2fs_readonly(sbi->sb)) ||
+ bdev_read_only(sbi->sb->s_bdev)) {
+ set_sbi_flag(sbi, SBI_NEED_SB_WRITE);
+ return -EROFS;
+ }
+
+ /* write back-up superblock first */
+ bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1);
if (!bh)
return -EIO;
-
- lock_buffer(bh);
- memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
- set_buffer_uptodate(bh);
- set_buffer_dirty(bh);
- unlock_buffer(bh);
-
- /* it's rare case, we can do fua all the time */
- err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+ err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
brelse(bh);
- return err;
-}
-
-int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
-{
- int err;
-
- /* write back-up superblock first */
- err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1);
-
/* if we are in recovery path, skip writing valid superblock */
if (recover || err)
return err;
/* write current valid superblock */
- return __f2fs_commit_super(sbi, sbi->valid_super_block);
+ bh = sb_getblk(sbi->sb, sbi->valid_super_block);
+ if (!bh)
+ return -EIO;
+ err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
+ brelse(bh);
+ return err;
}
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -1295,7 +1597,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
struct inode *root;
- long err;
+ int err;
bool retry = true, need_fsck = false;
char *options = NULL;
int recovery, i, valid_super_block;
@@ -1312,6 +1614,8 @@ try_onemore:
if (!sbi)
return -ENOMEM;
+ sbi->sb = sb;
+
/* Load the checksum driver */
sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
@@ -1327,12 +1631,14 @@ try_onemore:
goto free_sbi;
}
- err = read_raw_super_block(sb, &raw_super, &valid_super_block,
+ err = read_raw_super_block(sbi, &raw_super, &valid_super_block,
&recovery);
if (err)
goto free_sbi;
sb->s_fs_info = sbi;
+ sbi->raw_super = raw_super;
+
default_options(sbi);
/* parse mount options */
options = kstrdup((const char *)data, GFP_KERNEL);
@@ -1362,11 +1668,8 @@ try_onemore:
memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
/* init f2fs-specific super block info */
- sbi->sb = sb;
- sbi->raw_super = raw_super;
sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
- mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
@@ -1383,10 +1686,13 @@ try_onemore:
sbi->write_io[i].bio = NULL;
}
- init_rwsem(&sbi->cp_rwsem);
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
+ err = init_percpu_info(sbi);
+ if (err)
+ goto free_options;
+
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
@@ -1403,13 +1709,13 @@ try_onemore:
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
- sbi->total_valid_inode_count =
- le32_to_cpu(sbi->ckpt->valid_inode_count);
+ percpu_counter_set(&sbi->total_valid_inode_count,
+ le32_to_cpu(sbi->ckpt->valid_inode_count));
sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
sbi->total_valid_block_count =
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
- sbi->alloc_valid_block_count = 0;
+
for (i = 0; i < NR_INODE_TYPE; i++) {
INIT_LIST_HEAD(&sbi->inode_list[i]);
spin_lock_init(&sbi->inode_lock[i]);
@@ -1442,7 +1748,7 @@ try_onemore:
seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
if (__exist_node_summaries(sbi))
sbi->kbytes_written =
- le64_to_cpu(seg_i->sum_blk->journal.info.kbytes_written);
+ le64_to_cpu(seg_i->journal->info.kbytes_written);
build_gc_manager(sbi);
@@ -1487,9 +1793,12 @@ try_onemore:
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
- if (sbi->s_proc)
+ if (sbi->s_proc) {
proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
&f2fs_seq_segment_info_fops, sb);
+ proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_segment_bits_fops, sb);
+ }
sbi->s_kobj.kset = f2fs_kset;
init_completion(&sbi->s_kobj_unregister);
@@ -1513,14 +1822,24 @@ try_onemore:
if (need_fsck)
set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = recover_fsync_data(sbi);
- if (err) {
+ err = recover_fsync_data(sbi, false);
+ if (err < 0) {
need_fsck = true;
f2fs_msg(sb, KERN_ERR,
- "Cannot recover all fsync data errno=%ld", err);
+ "Cannot recover all fsync data errno=%d", err);
+ goto free_kobj;
+ }
+ } else {
+ err = recover_fsync_data(sbi, true);
+
+ if (!f2fs_readonly(sb) && err > 0) {
+ err = -EINVAL;
+ f2fs_msg(sb, KERN_ERR,
+ "Need to recover fsync data");
goto free_kobj;
}
}
+
/* recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -1537,10 +1856,10 @@ try_onemore:
kfree(options);
/* recover broken superblock */
- if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
+ if (recovery) {
err = f2fs_commit_super(sbi, true);
f2fs_msg(sb, KERN_INFO,
- "Try to recover %dth superblock, ret: %ld",
+ "Try to recover %dth superblock, ret: %d",
sbi->valid_super_block ? 1 : 2, err);
}
@@ -1549,12 +1868,14 @@ try_onemore:
return 0;
free_kobj:
+ f2fs_sync_inode_meta(sbi);
kobject_del(&sbi->s_kobj);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
free_proc:
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
f2fs_destroy_stats(sbi);
@@ -1575,6 +1896,7 @@ free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
free_options:
+ destroy_percpu_info(sbi);
kfree(options);
free_sb_buf:
kfree(raw_super);
@@ -1660,6 +1982,16 @@ static int __init init_f2fs_fs(void)
err = -ENOMEM;
goto free_extent_cache;
}
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_fault_inject.kset = f2fs_kset;
+ f2fs_build_fault_attr(0);
+ err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype,
+ NULL, "fault_injection");
+ if (err) {
+ f2fs_fault_inject.kset = NULL;
+ goto free_kset;
+ }
+#endif
err = register_shrinker(&f2fs_shrinker_info);
if (err)
goto free_kset;
@@ -1678,6 +2010,10 @@ free_filesystem:
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
free_kset:
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (f2fs_fault_inject.kset)
+ kobject_put(&f2fs_fault_inject);
+#endif
kset_unregister(f2fs_kset);
free_extent_cache:
destroy_extent_cache();
@@ -1697,14 +2033,17 @@ static void __exit exit_f2fs_fs(void)
{
remove_proc_entry("fs/f2fs", NULL);
f2fs_destroy_root_stats();
- unregister_shrinker(&f2fs_shrinker_info);
unregister_filesystem(&f2fs_fs_type);
+ unregister_shrinker(&f2fs_shrinker_info);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ kobject_put(&f2fs_fault_inject);
+#endif
+ kset_unregister(f2fs_kset);
destroy_extent_cache();
destroy_checkpoint_caches();
destroy_segment_manager_caches();
destroy_node_manager_caches();
destroy_inodecache();
- kset_unregister(f2fs_kset);
f2fs_destroy_trace_ios();
}
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 562ce0821559f..73b4e1d1912a7 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -25,11 +25,11 @@ static inline void __print_last_io(void)
if (!last_io.len)
return;
- trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n",
+ trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n",
last_io.major, last_io.minor,
last_io.pid, "----------------",
last_io.type,
- last_io.fio.rw,
+ last_io.fio.op, last_io.fio.op_flags,
last_io.fio.new_blkaddr,
last_io.len);
memset(&last_io, 0, sizeof(last_io));
@@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
if (last_io.major == major && last_io.minor == minor &&
last_io.pid == pid &&
last_io.type == __file_type(inode, pid) &&
- last_io.fio.rw == fio->rw &&
+ last_io.fio.op == fio->op &&
+ last_io.fio.op_flags == fio->op_flags &&
last_io.fio.new_blkaddr + last_io.len ==
fio->new_blkaddr) {
last_io.len++;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 06a72dc0191a0..c8898b5148eb6 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -26,10 +26,10 @@
#include "xattr.h"
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name, void *buffer,
- size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
@@ -45,15 +45,16 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
default:
return -EINVAL;
}
- return f2fs_getxattr(d_inode(dentry), handler->flags, name,
+ return f2fs_getxattr(inode, handler->flags, name,
buffer, size, NULL);
}
static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name, const void *value,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
@@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
default:
return -EINVAL;
}
- return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+ return f2fs_setxattr(inode, handler->flags, name,
value, size, NULL, flags);
}
@@ -86,29 +87,26 @@ static bool f2fs_xattr_trusted_list(struct dentry *dentry)
}
static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name, void *buffer,
- size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- struct inode *inode = d_inode(dentry);
-
if (buffer)
*((char *)buffer) = F2FS_I(inode)->i_advise;
return sizeof(char);
}
static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name, const void *value,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
-
if (!inode_owner_or_capable(inode))
return -EPERM;
if (value == NULL)
return -EINVAL;
F2FS_I(inode)->i_advise |= *(char *)value;
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
return 0;
}
@@ -301,6 +299,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
if (ipage) {
inline_addr = inline_xattr_addr(ipage);
f2fs_wait_on_page_writeback(ipage, NODE, true);
+ set_page_dirty(ipage);
} else {
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -443,13 +442,12 @@ static int __f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
struct page *ipage, int flags)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_xattr_entry *here, *last;
void *base_addr;
int found, newsize;
size_t len;
__u32 new_hsize;
- int error = -ENOMEM;
+ int error = 0;
if (name == NULL)
return -EINVAL;
@@ -467,7 +465,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
base_addr = read_all_xattrs(inode, ipage);
if (!base_addr)
- goto exit;
+ return -ENOMEM;
/* find entry with wanted name. */
here = __find_xattr(base_addr, index, len, name);
@@ -500,7 +498,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
free = free + ENTRY_SIZE(here);
if (unlikely(free < newsize)) {
- error = -ENOSPC;
+ error = -E2BIG;
goto exit;
}
}
@@ -528,7 +526,6 @@ static int __f2fs_setxattr(struct inode *inode, int index,
* Before we come here, old entry is removed.
* We just write new entry.
*/
- memset(last, 0, newsize);
last->e_name_index = index;
last->e_name_len = len;
memcpy(last->e_name, name, len);
@@ -542,19 +539,15 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
goto exit;
- if (is_inode_flag_set(fi, FI_ACL_MODE)) {
- inode->i_mode = fi->i_acl_mode;
+ if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
inode->i_ctime = CURRENT_TIME;
- clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
}
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
-
- if (ipage)
- update_inode(inode, ipage);
- else
- update_inode_page(inode);
+ f2fs_mark_inode_dirty_sync(inode);
exit:
kzfree(base_addr);
return error;
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index d0b95c95079bb..663e428596c6c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
buf.dirent = dirent;
buf.result = 0;
- inode_lock(inode);
+ inode_lock_shared(inode);
buf.ctx.pos = file->f_pos;
ret = -ENOENT;
if (!IS_DEADDIR(inode)) {
@@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
short_only, both ? &buf : NULL);
file->f_pos = buf.ctx.pos;
}
- inode_unlock(inode);
+ inode_unlock_shared(inode);
if (ret >= 0)
ret = buf.result;
return ret;
@@ -861,7 +861,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
const struct file_operations fat_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = fat_readdir,
+ .iterate_shared = fat_readdir,
.unlocked_ioctl = fat_dir_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = fat_compat_dir_ioctl,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 226281068a461..da04c0298fab4 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -244,13 +244,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
return err;
}
-static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
+ loff_t offset = iocb->ki_pos;
ssize_t ret;
if (iov_iter_rw(iter) == WRITE) {
@@ -272,7 +272,7 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* FAT need to use the DIO_LOCKING for avoiding the race
* condition of fat_get_block() and ->truncate().
*/
- ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block);
if (ret < 0 && iov_iter_rw(iter) == WRITE)
fat_write_failed(mapping, offset + count);
@@ -1589,7 +1589,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/*
* GFP_KERNEL is ok here, because while we do hold the
- * supeblock lock, memory pressure can't call back into
+ * superblock lock, memory pressure can't call back into
* the filesystem, since we're only just about to mount
* it and have no inodes etc active!
*/
@@ -1726,7 +1726,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sbi->dir_entries = bpb.fat_dir_entries;
if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
if (!silent)
- fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
+ fat_msg(sb, KERN_ERR, "bogus number of directory entries"
" (%u)", sbi->dir_entries);
goto out_invalid;
}
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index c4589e9817602..8a8698119ff74 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -267,7 +267,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
int i, err = 0;
for (i = 0; i < nr_bhs; i++)
- write_dirty_buffer(bhs[i], WRITE);
+ write_dirty_buffer(bhs[i], 0);
for (i = 0; i < nr_bhs; i++) {
wait_on_buffer(bhs[i]);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index b7e2b33aa7935..664655b2c55ff 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -154,7 +154,7 @@ static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
if (!error)
- qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
+ qstr->hash = full_name_hash(dentry, msdos_name, MSDOS_NAME);
return 0;
}
@@ -162,10 +162,10 @@ static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
* Compare two msdos names. If either of the names are invalid,
* we fall back to doing the standard name comparison.
*/
-static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
+static int msdos_cmp(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
+ struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
int error;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 7092584f424af..92b7363dafa95 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,7 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
*/
static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
{
- qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
+ qstr->hash = full_name_hash(dentry, qstr->name, vfat_striptail_len(qstr));
return 0;
}
@@ -127,7 +127,7 @@ static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
name = qstr->name;
len = vfat_striptail_len(qstr);
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
while (len--)
hash = partial_name_hash(nls_tolower(t, *name++), hash);
qstr->hash = end_name_hash(hash);
@@ -138,10 +138,10 @@ static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
/*
* Case insensitive compare of two vfat names.
*/
-static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
+static int vfat_cmpi(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
+ struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
unsigned int alen, blen;
/* A filename cannot end in '.' or we treat it like it has none */
@@ -157,7 +157,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
/*
* Case sensitive compare of two vfat names.
*/
-static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
+static int vfat_cmp(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
unsigned int alen, blen;
@@ -652,8 +652,8 @@ out_free:
return err;
}
-static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
- int cluster, struct timespec *ts,
+static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
+ int is_dir, int cluster, struct timespec *ts,
struct fat_slot_info *sinfo)
{
struct msdos_dir_slot *slots;
@@ -688,7 +688,7 @@ cleanup:
return err;
}
-static int vfat_find(struct inode *dir, struct qstr *qname,
+static int vfat_find(struct inode *dir, const struct qstr *qname,
struct fat_slot_info *sinfo)
{
unsigned int len = vfat_striptail_len(qname);
diff --git a/fs/file.c b/fs/file.c
index 1fbc5c0555a9c..6b1acdfe59dac 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -784,6 +784,11 @@ unsigned long __fdget_pos(unsigned int fd)
return v;
}
+void __f_unlock_pos(struct file *f)
+{
+ mutex_unlock(&f->f_pos_lock);
+}
+
/*
* We only lock f_pos if we have threads or if the file might be
* shared with another process. In both cases we'll have an elevated
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index 8dc1cd5c1efed..ce49df1020dd1 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -5,12 +5,21 @@ config VXFS_FS
FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
file system format. VERITAS VxFS(TM) is the standard file system
of SCO UnixWare (and possibly others) and optionally available
- for Sunsoft Solaris, HP-UX and many other operating systems.
- Currently only readonly access is supported.
+ for Sunsoft Solaris, HP-UX and many other operating systems. However
+ these particular OS implementations of vxfs may differ in on-disk
+ data endianness and/or superblock offset. The vxfs module has been
+ tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.)
+ Currently only readonly access is supported and VxFS versions
+ 2, 3 and 4. Tests were performed with HP-UX VxFS version 3.
NOTE: the file system type as used by mount(1), mount(2) and
fstab(5) is 'vxfs' as it describes the file system format, not
the actual driver.
+ There is a userspace utility for HP-UX logical volumes which makes
+ creating HP-UX logical volumes easy from HP-UX disk block device file
+ or regular file with image of the disk. See:
+ https://sourceforge.net/projects/linux-vxfs/
+
To compile this as a module, choose M here: the module will be
called freevxfs. If unsure, say N.
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index c8a92652612aa..a41ea0ba69433 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,13 +39,6 @@
*/
#include <linux/types.h>
-
-/*
- * Data types for use with the VxFS ondisk format.
- */
-typedef int32_t vx_daddr_t;
-typedef int32_t vx_ino_t;
-
/*
* Superblock magic number (vxfs_super->vs_magic).
*/
@@ -60,6 +54,14 @@ typedef int32_t vx_ino_t;
*/
#define VXFS_NEFREE 32
+enum vxfs_byte_order {
+ VXFS_BO_LE,
+ VXFS_BO_BE,
+};
+
+typedef __u16 __bitwise __fs16;
+typedef __u32 __bitwise __fs32;
+typedef __u64 __bitwise __fs64;
/*
* VxFS superblock (disk).
@@ -71,83 +73,83 @@ struct vxfs_sb {
* Lots of this fields are no more used by version 2
* and never filesystems.
*/
- u_int32_t vs_magic; /* Magic number */
- int32_t vs_version; /* VxFS version */
- u_int32_t vs_ctime; /* create time - secs */
- u_int32_t vs_cutime; /* create time - usecs */
- int32_t __unused1; /* unused */
- int32_t __unused2; /* unused */
- vx_daddr_t vs_old_logstart; /* obsolete */
- vx_daddr_t vs_old_logend; /* obsolete */
- int32_t vs_bsize; /* block size */
- int32_t vs_size; /* number of blocks */
- int32_t vs_dsize; /* number of data blocks */
- u_int32_t vs_old_ninode; /* obsolete */
- int32_t vs_old_nau; /* obsolete */
- int32_t __unused3; /* unused */
- int32_t vs_old_defiextsize; /* obsolete */
- int32_t vs_old_ilbsize; /* obsolete */
- int32_t vs_immedlen; /* size of immediate data area */
- int32_t vs_ndaddr; /* number of direct extentes */
- vx_daddr_t vs_firstau; /* address of first AU */
- vx_daddr_t vs_emap; /* offset of extent map in AU */
- vx_daddr_t vs_imap; /* offset of inode map in AU */
- vx_daddr_t vs_iextop; /* offset of ExtOp. map in AU */
- vx_daddr_t vs_istart; /* offset of inode list in AU */
- vx_daddr_t vs_bstart; /* offset of fdblock in AU */
- vx_daddr_t vs_femap; /* aufirst + emap */
- vx_daddr_t vs_fimap; /* aufirst + imap */
- vx_daddr_t vs_fiextop; /* aufirst + iextop */
- vx_daddr_t vs_fistart; /* aufirst + istart */
- vx_daddr_t vs_fbstart; /* aufirst + bstart */
- int32_t vs_nindir; /* number of entries in indir */
- int32_t vs_aulen; /* length of AU in blocks */
- int32_t vs_auimlen; /* length of imap in blocks */
- int32_t vs_auemlen; /* length of emap in blocks */
- int32_t vs_auilen; /* length of ilist in blocks */
- int32_t vs_aupad; /* length of pad in blocks */
- int32_t vs_aublocks; /* data blocks in AU */
- int32_t vs_maxtier; /* log base 2 of aublocks */
- int32_t vs_inopb; /* number of inodes per blk */
- int32_t vs_old_inopau; /* obsolete */
- int32_t vs_old_inopilb; /* obsolete */
- int32_t vs_old_ndiripau; /* obsolete */
- int32_t vs_iaddrlen; /* size of indirect addr ext. */
- int32_t vs_bshift; /* log base 2 of bsize */
- int32_t vs_inoshift; /* log base 2 of inobp */
- int32_t vs_bmask; /* ~( bsize - 1 ) */
- int32_t vs_boffmask; /* bsize - 1 */
- int32_t vs_old_inomask; /* old_inopilb - 1 */
- int32_t vs_checksum; /* checksum of V1 data */
+ __fs32 vs_magic; /* Magic number */
+ __fs32 vs_version; /* VxFS version */
+ __fs32 vs_ctime; /* create time - secs */
+ __fs32 vs_cutime; /* create time - usecs */
+ __fs32 __unused1; /* unused */
+ __fs32 __unused2; /* unused */
+ __fs32 vs_old_logstart; /* obsolete */
+ __fs32 vs_old_logend; /* obsolete */
+ __fs32 vs_bsize; /* block size */
+ __fs32 vs_size; /* number of blocks */
+ __fs32 vs_dsize; /* number of data blocks */
+ __fs32 vs_old_ninode; /* obsolete */
+ __fs32 vs_old_nau; /* obsolete */
+ __fs32 __unused3; /* unused */
+ __fs32 vs_old_defiextsize; /* obsolete */
+ __fs32 vs_old_ilbsize; /* obsolete */
+ __fs32 vs_immedlen; /* size of immediate data area */
+ __fs32 vs_ndaddr; /* number of direct extentes */
+ __fs32 vs_firstau; /* address of first AU */
+ __fs32 vs_emap; /* offset of extent map in AU */
+ __fs32 vs_imap; /* offset of inode map in AU */
+ __fs32 vs_iextop; /* offset of ExtOp. map in AU */
+ __fs32 vs_istart; /* offset of inode list in AU */
+ __fs32 vs_bstart; /* offset of fdblock in AU */
+ __fs32 vs_femap; /* aufirst + emap */
+ __fs32 vs_fimap; /* aufirst + imap */
+ __fs32 vs_fiextop; /* aufirst + iextop */
+ __fs32 vs_fistart; /* aufirst + istart */
+ __fs32 vs_fbstart; /* aufirst + bstart */
+ __fs32 vs_nindir; /* number of entries in indir */
+ __fs32 vs_aulen; /* length of AU in blocks */
+ __fs32 vs_auimlen; /* length of imap in blocks */
+ __fs32 vs_auemlen; /* length of emap in blocks */
+ __fs32 vs_auilen; /* length of ilist in blocks */
+ __fs32 vs_aupad; /* length of pad in blocks */
+ __fs32 vs_aublocks; /* data blocks in AU */
+ __fs32 vs_maxtier; /* log base 2 of aublocks */
+ __fs32 vs_inopb; /* number of inodes per blk */
+ __fs32 vs_old_inopau; /* obsolete */
+ __fs32 vs_old_inopilb; /* obsolete */
+ __fs32 vs_old_ndiripau; /* obsolete */
+ __fs32 vs_iaddrlen; /* size of indirect addr ext. */
+ __fs32 vs_bshift; /* log base 2 of bsize */
+ __fs32 vs_inoshift; /* log base 2 of inobp */
+ __fs32 vs_bmask; /* ~( bsize - 1 ) */
+ __fs32 vs_boffmask; /* bsize - 1 */
+ __fs32 vs_old_inomask; /* old_inopilb - 1 */
+ __fs32 vs_checksum; /* checksum of V1 data */
/*
* Version 1, writable
*/
- int32_t vs_free; /* number of free blocks */
- int32_t vs_ifree; /* number of free inodes */
- int32_t vs_efree[VXFS_NEFREE]; /* number of free extents by size */
- int32_t vs_flags; /* flags ?!? */
- u_int8_t vs_mod; /* filesystem has been changed */
- u_int8_t vs_clean; /* clean FS */
- u_int16_t __unused4; /* unused */
- u_int32_t vs_firstlogid; /* mount time log ID */
- u_int32_t vs_wtime; /* last time written - sec */
- u_int32_t vs_wutime; /* last time written - usec */
- u_int8_t vs_fname[6]; /* FS name */
- u_int8_t vs_fpack[6]; /* FS pack name */
- int32_t vs_logversion; /* log format version */
- int32_t __unused5; /* unused */
+ __fs32 vs_free; /* number of free blocks */
+ __fs32 vs_ifree; /* number of free inodes */
+ __fs32 vs_efree[VXFS_NEFREE]; /* number of free extents by size */
+ __fs32 vs_flags; /* flags ?!? */
+ __u8 vs_mod; /* filesystem has been changed */
+ __u8 vs_clean; /* clean FS */
+ __fs16 __unused4; /* unused */
+ __fs32 vs_firstlogid; /* mount time log ID */
+ __fs32 vs_wtime; /* last time written - sec */
+ __fs32 vs_wutime; /* last time written - usec */
+ __u8 vs_fname[6]; /* FS name */
+ __u8 vs_fpack[6]; /* FS pack name */
+ __fs32 vs_logversion; /* log format version */
+ __u32 __unused5; /* unused */
/*
* Version 2, Read-only
*/
- vx_daddr_t vs_oltext[2]; /* OLT extent and replica */
- int32_t vs_oltsize; /* OLT extent size */
- int32_t vs_iauimlen; /* size of inode map */
- int32_t vs_iausize; /* size of IAU in blocks */
- int32_t vs_dinosize; /* size of inode in bytes */
- int32_t vs_old_dniaddr; /* indir levels per inode */
- int32_t vs_checksum2; /* checksum of V2 RO */
+ __fs32 vs_oltext[2]; /* OLT extent and replica */
+ __fs32 vs_oltsize; /* OLT extent size */
+ __fs32 vs_iauimlen; /* size of inode map */
+ __fs32 vs_iausize; /* size of IAU in blocks */
+ __fs32 vs_dinosize; /* size of inode in bytes */
+ __fs32 vs_old_dniaddr; /* indir levels per inode */
+ __fs32 vs_checksum2; /* checksum of V2 RO */
/*
* Actually much more...
@@ -168,8 +170,32 @@ struct vxfs_sb_info {
ino_t vsi_fshino; /* fileset header inode */
daddr_t vsi_oltext; /* OLT extent */
daddr_t vsi_oltsize; /* OLT size */
+ enum vxfs_byte_order byte_order;
};
+static inline u16 fs16_to_cpu(struct vxfs_sb_info *sbi, __fs16 a)
+{
+ if (sbi->byte_order == VXFS_BO_BE)
+ return be16_to_cpu((__force __be16)a);
+ else
+ return le16_to_cpu((__force __le16)a);
+}
+
+static inline u32 fs32_to_cpu(struct vxfs_sb_info *sbi, __fs32 a)
+{
+ if (sbi->byte_order == VXFS_BO_BE)
+ return be32_to_cpu((__force __be32)a);
+ else
+ return le32_to_cpu((__force __le32)a);
+}
+
+static inline u64 fs64_to_cpu(struct vxfs_sb_info *sbi, __fs64 a)
+{
+ if (sbi->byte_order == VXFS_BO_BE)
+ return be64_to_cpu((__force __be64)a);
+ else
+ return le64_to_cpu((__force __le64)a);
+}
/*
* File modes. File types above 0xf000 are vxfs internal only, they should
@@ -247,13 +273,6 @@ enum {
#define VXFS_ISIMMED(ip) VXFS_IS_ORG((ip), VXFS_ORG_IMMED)
#define VXFS_ISTYPED(ip) VXFS_IS_ORG((ip), VXFS_ORG_TYPED)
-
-/*
- * Get filesystem private data from VFS inode.
- */
-#define VXFS_INO(ip) \
- ((struct vxfs_inode_info *)(ip)->i_private)
-
/*
* Get filesystem private data from VFS superblock.
*/
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index f86fd3cacd5ab..1fd41cf98b9fc 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -68,8 +68,9 @@ vxfs_bmap_ext4(struct inode *ip, long bn)
{
struct super_block *sb = ip->i_sb;
struct vxfs_inode_info *vip = VXFS_INO(ip);
+ struct vxfs_sb_info *sbi = VXFS_SBI(sb);
unsigned long bsize = sb->s_blocksize;
- u32 indsize = vip->vii_ext4.ve4_indsize;
+ u32 indsize = fs32_to_cpu(sbi, vip->vii_ext4.ve4_indsize);
int i;
if (indsize > sb->s_blocksize)
@@ -77,22 +78,24 @@ vxfs_bmap_ext4(struct inode *ip, long bn)
for (i = 0; i < VXFS_NDADDR; i++) {
struct direct *d = vip->vii_ext4.ve4_direct + i;
- if (bn >= 0 && bn < d->size)
- return (bn + d->extent);
- bn -= d->size;
+ if (bn >= 0 && bn < fs32_to_cpu(sbi, d->size))
+ return (bn + fs32_to_cpu(sbi, d->extent));
+ bn -= fs32_to_cpu(sbi, d->size);
}
if ((bn / (indsize * indsize * bsize / 4)) == 0) {
struct buffer_head *buf;
daddr_t bno;
- u32 *indir;
+ __fs32 *indir;
- buf = sb_bread(sb, vip->vii_ext4.ve4_indir[0]);
+ buf = sb_bread(sb,
+ fs32_to_cpu(sbi, vip->vii_ext4.ve4_indir[0]));
if (!buf || !buffer_mapped(buf))
goto fail_buf;
- indir = (u32 *)buf->b_data;
- bno = indir[(bn/indsize) % (indsize*bn)] + (bn%indsize);
+ indir = (__fs32 *)buf->b_data;
+ bno = fs32_to_cpu(sbi, indir[(bn / indsize) % (indsize * bn)]) +
+ (bn % indsize);
brelse(buf);
return bno;
@@ -127,6 +130,7 @@ fail_buf:
static daddr_t
vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
{
+ struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb);
struct buffer_head *bp = NULL;
daddr_t pblock = 0;
int i;
@@ -142,24 +146,27 @@ vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
typ = ((struct vxfs_typed *)bp->b_data) +
(i % VXFS_TYPED_PER_BLOCK(ip->i_sb));
- off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK);
+ off = fs64_to_cpu(sbi, typ->vt_hdr) & VXFS_TYPED_OFFSETMASK;
if (block < off) {
brelse(bp);
continue;
}
- switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) {
+ switch ((u_int32_t)(fs64_to_cpu(sbi, typ->vt_hdr) >>
+ VXFS_TYPED_TYPESHIFT)) {
case VXFS_TYPED_INDIRECT:
- pblock = vxfs_bmap_indir(ip, typ->vt_block,
- typ->vt_size, block - off);
+ pblock = vxfs_bmap_indir(ip,
+ fs32_to_cpu(sbi, typ->vt_block),
+ fs32_to_cpu(sbi, typ->vt_size),
+ block - off);
if (pblock == -2)
break;
goto out;
case VXFS_TYPED_DATA:
- if ((block - off) >= typ->vt_size)
+ if ((block - off) >= fs32_to_cpu(sbi, typ->vt_size))
break;
- pblock = (typ->vt_block + block - off);
+ pblock = fs32_to_cpu(sbi, typ->vt_block) + block - off;
goto out;
case VXFS_TYPED_INDIRECT_DEV4:
case VXFS_TYPED_DATA_DEV4: {
@@ -167,13 +174,15 @@ vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
(struct vxfs_typed_dev4 *)typ;
printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n");
- printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n",
- (unsigned long long) typ4->vd4_block,
- (unsigned long long) typ4->vd4_size,
- typ4->vd4_dev);
+ printk(KERN_INFO "block: %llu\tsize: %lld\tdev: %d\n",
+ fs64_to_cpu(sbi, typ4->vd4_block),
+ fs64_to_cpu(sbi, typ4->vd4_size),
+ fs32_to_cpu(sbi, typ4->vd4_dev));
goto fail;
}
default:
+ printk(KERN_ERR "%s:%d vt_hdr %llu\n", __func__,
+ __LINE__, fs64_to_cpu(sbi, typ->vt_hdr));
BUG();
}
brelse(bp);
@@ -201,28 +210,33 @@ static daddr_t
vxfs_bmap_typed(struct inode *ip, long iblock)
{
struct vxfs_inode_info *vip = VXFS_INO(ip);
+ struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb);
daddr_t pblock = 0;
int i;
for (i = 0; i < VXFS_NTYPED; i++) {
struct vxfs_typed *typ = vip->vii_org.typed + i;
- int64_t off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK);
+ u64 hdr = fs64_to_cpu(sbi, typ->vt_hdr);
+ int64_t off = (hdr & VXFS_TYPED_OFFSETMASK);
#ifdef DIAGNOSTIC
vxfs_typdump(typ);
#endif
if (iblock < off)
continue;
- switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) {
+ switch ((u32)(hdr >> VXFS_TYPED_TYPESHIFT)) {
case VXFS_TYPED_INDIRECT:
- pblock = vxfs_bmap_indir(ip, typ->vt_block,
- typ->vt_size, iblock - off);
+ pblock = vxfs_bmap_indir(ip,
+ fs32_to_cpu(sbi, typ->vt_block),
+ fs32_to_cpu(sbi, typ->vt_size),
+ iblock - off);
if (pblock == -2)
break;
return (pblock);
case VXFS_TYPED_DATA:
- if ((iblock - off) < typ->vt_size)
- return (typ->vt_block + iblock - off);
+ if ((iblock - off) < fs32_to_cpu(sbi, typ->vt_size))
+ return (fs32_to_cpu(sbi, typ->vt_block) +
+ iblock - off);
break;
case VXFS_TYPED_INDIRECT_DEV4:
case VXFS_TYPED_DATA_DEV4: {
@@ -230,10 +244,10 @@ vxfs_bmap_typed(struct inode *ip, long iblock)
(struct vxfs_typed_dev4 *)typ;
printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n");
- printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n",
- (unsigned long long) typ4->vd4_block,
- (unsigned long long) typ4->vd4_size,
- typ4->vd4_dev);
+ printk(KERN_INFO "block: %llu\tsize: %lld\tdev: %d\n",
+ fs64_to_cpu(sbi, typ4->vd4_block),
+ fs64_to_cpu(sbi, typ4->vd4_size),
+ fs32_to_cpu(sbi, typ4->vd4_dev));
return 0;
}
default:
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index aaf1fb0986395..acc5477b3f232 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -48,9 +48,9 @@
* Linux driver for now.
*/
struct vxfs_dirblk {
- u_int16_t d_free; /* free space in dirblock */
- u_int16_t d_nhash; /* no of hash chains */
- u_int16_t d_hash[1]; /* hash chain */
+ __fs16 d_free; /* free space in dirblock */
+ __fs16 d_nhash; /* no of hash chains */
+ __fs16 d_hash[1]; /* hash chain */
};
/*
@@ -63,10 +63,10 @@ struct vxfs_dirblk {
* VxFS directory entry.
*/
struct vxfs_direct {
- vx_ino_t d_ino; /* inode number */
- u_int16_t d_reclen; /* record length */
- u_int16_t d_namelen; /* d_name length */
- u_int16_t d_hashnext; /* next hash entry */
+ __fs32 d_ino; /* inode number */
+ __fs16 d_reclen; /* record length */
+ __fs16 d_namelen; /* d_name length */
+ __fs16 d_hashnext; /* next hash entry */
char d_name[VXFS_NAMELEN]; /* name */
};
@@ -87,6 +87,7 @@ struct vxfs_direct {
/*
* VXFS_DIRBLKOV is the overhead of a specific dirblock.
*/
-#define VXFS_DIRBLKOV(dbp) ((sizeof(short) * dbp->d_nhash) + 4)
+#define VXFS_DIRBLKOV(sbi, dbp) \
+ ((sizeof(short) * fs16_to_cpu(sbi, dbp->d_nhash)) + 4)
#endif /* _VXFS_DIR_H_ */
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index e3dcb4467d927..f5c428e210245 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -52,14 +52,10 @@ extern int vxfs_read_fshead(struct super_block *);
/* vxfs_inode.c */
extern const struct address_space_operations vxfs_immed_aops;
-extern struct kmem_cache *vxfs_inode_cachep;
extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t);
-extern struct inode * vxfs_get_fake_inode(struct super_block *,
- struct vxfs_inode_info *);
-extern void vxfs_put_fake_inode(struct inode *);
-extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t);
-extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t);
-extern struct inode * vxfs_iget(struct super_block *, ino_t);
+extern struct inode *vxfs_blkiget(struct super_block *, u_long, ino_t);
+extern struct inode *vxfs_stiget(struct super_block *, ino_t);
+extern struct inode *vxfs_iget(struct super_block *, ino_t);
extern void vxfs_evict_inode(struct inode *);
/* vxfs_lookup.c */
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index c9a6a94e58e9c..a4610a77649e5 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -108,31 +109,26 @@ vxfs_read_fshead(struct super_block *sbp)
{
struct vxfs_sb_info *infp = VXFS_SBI(sbp);
struct vxfs_fsh *pfp, *sfp;
- struct vxfs_inode_info *vip, *tip;
+ struct vxfs_inode_info *vip;
- vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
- if (!vip) {
+ infp->vsi_fship = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
+ if (!infp->vsi_fship) {
printk(KERN_ERR "vxfs: unable to read fsh inode\n");
return -EINVAL;
}
+
+ vip = VXFS_INO(infp->vsi_fship);
if (!VXFS_ISFSH(vip)) {
printk(KERN_ERR "vxfs: fsh list inode is of wrong type (%x)\n",
vip->vii_mode & VXFS_TYPE_MASK);
- goto out_free_fship;
+ goto out_iput_fship;
}
-
#ifdef DIAGNOSTIC
printk("vxfs: fsh inode dump:\n");
vxfs_dumpi(vip, infp->vsi_fshino);
#endif
- infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
- if (!infp->vsi_fship) {
- printk(KERN_ERR "vxfs: unable to get fsh inode\n");
- goto out_free_fship;
- }
-
sfp = vxfs_getfsh(infp->vsi_fship, 0);
if (!sfp) {
printk(KERN_ERR "vxfs: unable to get structural fsh\n");
@@ -153,14 +149,10 @@ vxfs_read_fshead(struct super_block *sbp)
vxfs_dumpfsh(pfp);
#endif
- tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]);
- if (!tip)
- goto out_free_pfp;
-
- infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
+ infp->vsi_stilist = vxfs_blkiget(sbp, infp->vsi_iext,
+ fs32_to_cpu(infp, sfp->fsh_ilistino[0]));
if (!infp->vsi_stilist) {
printk(KERN_ERR "vxfs: unable to get structural list inode\n");
- kfree(tip);
goto out_free_pfp;
}
if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) {
@@ -169,13 +161,9 @@ vxfs_read_fshead(struct super_block *sbp)
goto out_iput_stilist;
}
- tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]);
- if (!tip)
- goto out_iput_stilist;
- infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
+ infp->vsi_ilist = vxfs_stiget(sbp, fs32_to_cpu(infp, pfp->fsh_ilistino[0]));
if (!infp->vsi_ilist) {
printk(KERN_ERR "vxfs: unable to get inode list inode\n");
- kfree(tip);
goto out_iput_stilist;
}
if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) {
@@ -184,6 +172,8 @@ vxfs_read_fshead(struct super_block *sbp)
goto out_iput_ilist;
}
+ kfree(pfp);
+ kfree(sfp);
return 0;
out_iput_ilist:
@@ -197,7 +187,4 @@ vxfs_read_fshead(struct super_block *sbp)
out_iput_fship:
iput(infp->vsi_fship);
return -EINVAL;
- out_free_fship:
- kfree(vip);
- return -EINVAL;
}
diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h
index ead0d640c1814..e026f0c491596 100644
--- a/fs/freevxfs/vxfs_fshead.h
+++ b/fs/freevxfs/vxfs_fshead.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -42,20 +43,20 @@
* Fileset header
*/
struct vxfs_fsh {
- u_int32_t fsh_version; /* fileset header version */
- u_int32_t fsh_fsindex; /* fileset index */
- u_int32_t fsh_time; /* modification time - sec */
- u_int32_t fsh_utime; /* modification time - usec */
- u_int32_t fsh_extop; /* extop flags */
- vx_ino_t fsh_ninodes; /* allocated inodes */
- u_int32_t fsh_nau; /* number of IAUs */
- u_int32_t fsh_old_ilesize; /* old size of ilist */
- u_int32_t fsh_dflags; /* flags */
- u_int32_t fsh_quota; /* quota limit */
- vx_ino_t fsh_maxinode; /* maximum inode number */
- vx_ino_t fsh_iauino; /* IAU inode */
- vx_ino_t fsh_ilistino[2]; /* ilist inodes */
- vx_ino_t fsh_lctino; /* link count table inode */
+ __fs32 fsh_version; /* fileset header version */
+ __fs32 fsh_fsindex; /* fileset index */
+ __fs32 fsh_time; /* modification time - sec */
+ __fs32 fsh_utime; /* modification time - usec */
+ __fs32 fsh_extop; /* extop flags */
+ __fs32 fsh_ninodes; /* allocated inodes */
+ __fs32 fsh_nau; /* number of IAUs */
+ __fs32 fsh_old_ilesize; /* old size of ilist */
+ __fs32 fsh_dflags; /* flags */
+ __fs32 fsh_quota; /* quota limit */
+ __fs32 fsh_maxinode; /* maximum inode number */
+ __fs32 fsh_iauino; /* IAU inode */
+ __fs32 fsh_ilistino[2]; /* ilist inodes */
+ __fs32 fsh_lctino; /* link count table inode */
/*
* Slightly more fields follow, but they
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index cb84f0fcc72a4..bfc780c682fb8 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -66,11 +66,11 @@ static int
vxfs_immed_readpage(struct file *fp, struct page *pp)
{
struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host);
- u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT;
+ u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT;
caddr_t kaddr;
kaddr = kmap(pp);
- memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE);
+ memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE);
kunmap(pp);
flush_dcache_page(pp);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 3e2ccade61edb..1f41b25ef38b2 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -42,9 +43,6 @@
#include "vxfs_extern.h"
-struct kmem_cache *vxfs_inode_cachep;
-
-
#ifdef DIAGNOSTIC
/*
* Dump inode contents (partially).
@@ -68,6 +66,83 @@ vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino)
}
#endif
+/**
+ * vxfs_transmod - mode for a VxFS inode
+ * @vip: VxFS inode
+ *
+ * Description:
+ * vxfs_transmod returns a Linux mode_t for a given
+ * VxFS inode structure.
+ */
+static __inline__ umode_t
+vxfs_transmod(struct vxfs_inode_info *vip)
+{
+ umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK;
+
+ if (VXFS_ISFIFO(vip))
+ ret |= S_IFIFO;
+ if (VXFS_ISCHR(vip))
+ ret |= S_IFCHR;
+ if (VXFS_ISDIR(vip))
+ ret |= S_IFDIR;
+ if (VXFS_ISBLK(vip))
+ ret |= S_IFBLK;
+ if (VXFS_ISLNK(vip))
+ ret |= S_IFLNK;
+ if (VXFS_ISREG(vip))
+ ret |= S_IFREG;
+ if (VXFS_ISSOC(vip))
+ ret |= S_IFSOCK;
+
+ return (ret);
+}
+
+static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
+ struct vxfs_inode_info *vip, struct vxfs_dinode *dip)
+{
+ struct inode *inode = &vip->vfs_inode;
+
+ vip->vii_mode = fs32_to_cpu(sbi, dip->vdi_mode);
+ vip->vii_nlink = fs32_to_cpu(sbi, dip->vdi_nlink);
+ vip->vii_uid = fs32_to_cpu(sbi, dip->vdi_uid);
+ vip->vii_gid = fs32_to_cpu(sbi, dip->vdi_gid);
+ vip->vii_size = fs64_to_cpu(sbi, dip->vdi_size);
+ vip->vii_atime = fs32_to_cpu(sbi, dip->vdi_atime);
+ vip->vii_autime = fs32_to_cpu(sbi, dip->vdi_autime);
+ vip->vii_mtime = fs32_to_cpu(sbi, dip->vdi_mtime);
+ vip->vii_mutime = fs32_to_cpu(sbi, dip->vdi_mutime);
+ vip->vii_ctime = fs32_to_cpu(sbi, dip->vdi_ctime);
+ vip->vii_cutime = fs32_to_cpu(sbi, dip->vdi_cutime);
+ vip->vii_orgtype = dip->vdi_orgtype;
+
+ vip->vii_blocks = fs32_to_cpu(sbi, dip->vdi_blocks);
+ vip->vii_gen = fs32_to_cpu(sbi, dip->vdi_gen);
+
+ if (VXFS_ISDIR(vip))
+ vip->vii_dotdot = fs32_to_cpu(sbi, dip->vdi_dotdot);
+ else if (!VXFS_ISREG(vip) && !VXFS_ISLNK(vip))
+ vip->vii_rdev = fs32_to_cpu(sbi, dip->vdi_rdev);
+
+ /* don't endian swap the fields that differ by orgtype */
+ memcpy(&vip->vii_org, &dip->vdi_org, sizeof(vip->vii_org));
+
+ inode->i_mode = vxfs_transmod(vip);
+ i_uid_write(inode, (uid_t)vip->vii_uid);
+ i_gid_write(inode, (gid_t)vip->vii_gid);
+
+ set_nlink(inode, vip->vii_nlink);
+ inode->i_size = vip->vii_size;
+
+ inode->i_atime.tv_sec = vip->vii_atime;
+ inode->i_ctime.tv_sec = vip->vii_ctime;
+ inode->i_mtime.tv_sec = vip->vii_mtime;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime.tv_nsec = 0;
+
+ inode->i_blocks = vip->vii_blocks;
+ inode->i_generation = vip->vii_gen;
+}
/**
* vxfs_blkiget - find inode based on extent #
@@ -85,50 +160,55 @@ vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino)
* buffercache. This function should not be used outside the
* read_super() method, otherwise the data may be incoherent.
*/
-struct vxfs_inode_info *
+struct inode *
vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino)
{
struct buffer_head *bp;
+ struct inode *inode;
u_long block, offset;
+ inode = new_inode(sbp);
+ if (!inode)
+ return NULL;
+ inode->i_ino = get_next_ino();
+
block = extent + ((ino * VXFS_ISIZE) / sbp->s_blocksize);
offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE);
bp = sb_bread(sbp, block);
if (bp && buffer_mapped(bp)) {
- struct vxfs_inode_info *vip;
+ struct vxfs_inode_info *vip = VXFS_INO(inode);
struct vxfs_dinode *dip;
- if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
- goto fail;
dip = (struct vxfs_dinode *)(bp->b_data + offset);
- memcpy(vip, dip, sizeof(*vip));
+ dip2vip_cpy(VXFS_SBI(sbp), vip, dip);
+ vip->vfs_inode.i_mapping->a_ops = &vxfs_aops;
#ifdef DIAGNOSTIC
vxfs_dumpi(vip, ino);
#endif
brelse(bp);
- return (vip);
+ return inode;
}
-fail:
printk(KERN_WARNING "vxfs: unable to read block %ld\n", block);
brelse(bp);
+ iput(inode);
return NULL;
}
/**
* __vxfs_iget - generic find inode facility
- * @sbp: VFS superblock
- * @ino: inode number
* @ilistp: inode list
+ * @vip: VxFS inode to fill in
+ * @ino: inode number
*
* Description:
* Search the for inode number @ino in the filesystem
* described by @sbp. Use the specified inode table (@ilistp).
- * Returns the matching VxFS inode on success, else an error code.
+ * Returns the matching inode on success, else an error code.
*/
-static struct vxfs_inode_info *
-__vxfs_iget(ino_t ino, struct inode *ilistp)
+static int
+__vxfs_iget(struct inode *ilistp, struct vxfs_inode_info *vip, ino_t ino)
{
struct page *pp;
u_long offset;
@@ -137,28 +217,22 @@ __vxfs_iget(ino_t ino, struct inode *ilistp)
pp = vxfs_get_page(ilistp->i_mapping, ino * VXFS_ISIZE / PAGE_SIZE);
if (!IS_ERR(pp)) {
- struct vxfs_inode_info *vip;
struct vxfs_dinode *dip;
caddr_t kaddr = (char *)page_address(pp);
- if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
- goto fail;
dip = (struct vxfs_dinode *)(kaddr + offset);
- memcpy(vip, dip, sizeof(*vip));
+ dip2vip_cpy(VXFS_SBI(ilistp->i_sb), vip, dip);
+ vip->vfs_inode.i_mapping->a_ops = &vxfs_aops;
#ifdef DIAGNOSTIC
vxfs_dumpi(vip, ino);
#endif
vxfs_put_page(pp);
- return (vip);
+ return 0;
}
- printk(KERN_WARNING "vxfs: error on page %p\n", pp);
- return ERR_CAST(pp);
-
-fail:
- printk(KERN_WARNING "vxfs: unable to read inode %ld\n", (unsigned long)ino);
- vxfs_put_page(pp);
- return ERR_PTR(-ENOMEM);
+ printk(KERN_WARNING "vxfs: error on page 0x%p for inode %ld\n",
+ pp, (unsigned long)ino);
+ return PTR_ERR(pp);
}
/**
@@ -169,116 +243,26 @@ fail:
* Description:
* Find inode @ino in the filesystem described by @sbp using
* the structural inode list.
- * Returns the matching VxFS inode on success, else a NULL pointer.
- */
-struct vxfs_inode_info *
-vxfs_stiget(struct super_block *sbp, ino_t ino)
-{
- struct vxfs_inode_info *vip;
-
- vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_stilist);
- return IS_ERR(vip) ? NULL : vip;
-}
-
-/**
- * vxfs_transmod - mode for a VxFS inode
- * @vip: VxFS inode
- *
- * Description:
- * vxfs_transmod returns a Linux mode_t for a given
- * VxFS inode structure.
- */
-static __inline__ umode_t
-vxfs_transmod(struct vxfs_inode_info *vip)
-{
- umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK;
-
- if (VXFS_ISFIFO(vip))
- ret |= S_IFIFO;
- if (VXFS_ISCHR(vip))
- ret |= S_IFCHR;
- if (VXFS_ISDIR(vip))
- ret |= S_IFDIR;
- if (VXFS_ISBLK(vip))
- ret |= S_IFBLK;
- if (VXFS_ISLNK(vip))
- ret |= S_IFLNK;
- if (VXFS_ISREG(vip))
- ret |= S_IFREG;
- if (VXFS_ISSOC(vip))
- ret |= S_IFSOCK;
-
- return (ret);
-}
-
-/**
- * vxfs_iinit- helper to fill inode fields
- * @ip: VFS inode
- * @vip: VxFS inode
- *
- * Description:
- * vxfs_instino is a helper function to fill in all relevant
- * fields in @ip from @vip.
- */
-static void
-vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
-{
-
- ip->i_mode = vxfs_transmod(vip);
- i_uid_write(ip, (uid_t)vip->vii_uid);
- i_gid_write(ip, (gid_t)vip->vii_gid);
-
- set_nlink(ip, vip->vii_nlink);
- ip->i_size = vip->vii_size;
-
- ip->i_atime.tv_sec = vip->vii_atime;
- ip->i_ctime.tv_sec = vip->vii_ctime;
- ip->i_mtime.tv_sec = vip->vii_mtime;
- ip->i_atime.tv_nsec = 0;
- ip->i_ctime.tv_nsec = 0;
- ip->i_mtime.tv_nsec = 0;
-
- ip->i_blocks = vip->vii_blocks;
- ip->i_generation = vip->vii_gen;
-
- ip->i_private = vip;
-
-}
-
-/**
- * vxfs_get_fake_inode - get fake inode structure
- * @sbp: filesystem superblock
- * @vip: fspriv inode
- *
- * Description:
- * vxfs_fake_inode gets a fake inode (not in the inode hash) for a
- * superblock, vxfs_inode pair.
- * Returns the filled VFS inode.
+ * Returns the matching inode on success, else a NULL pointer.
*/
struct inode *
-vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
+vxfs_stiget(struct super_block *sbp, ino_t ino)
{
- struct inode *ip = NULL;
-
- if ((ip = new_inode(sbp))) {
- ip->i_ino = get_next_ino();
- vxfs_iinit(ip, vip);
- ip->i_mapping->a_ops = &vxfs_aops;
+ struct inode *inode;
+ int error;
+
+ inode = new_inode(sbp);
+ if (!inode)
+ return NULL;
+ inode->i_ino = get_next_ino();
+
+ error = __vxfs_iget(VXFS_SBI(sbp)->vsi_stilist, VXFS_INO(inode), ino);
+ if (error) {
+ iput(inode);
+ return NULL;
}
- return (ip);
-}
-/**
- * vxfs_put_fake_inode - free faked inode
- * *ip: VFS inode
- *
- * Description:
- * vxfs_put_fake_inode frees all data associated with @ip.
- */
-void
-vxfs_put_fake_inode(struct inode *ip)
-{
- iput(ip);
+ return inode;
}
/**
@@ -296,6 +280,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
struct vxfs_inode_info *vip;
const struct address_space_operations *aops;
struct inode *ip;
+ int error;
ip = iget_locked(sbp, ino);
if (!ip)
@@ -303,14 +288,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
if (!(ip->i_state & I_NEW))
return ip;
- vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist);
- if (IS_ERR(vip)) {
+ vip = VXFS_INO(ip);
+ error = __vxfs_iget(VXFS_SBI(sbp)->vsi_ilist, vip, ino);
+ if (error) {
iget_failed(ip);
- return ERR_CAST(vip);
+ return ERR_PTR(error);
}
- vxfs_iinit(ip, vip);
-
if (VXFS_ISIMMED(vip))
aops = &vxfs_immed_aops;
else
@@ -341,12 +325,6 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
return ip;
}
-static void vxfs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(vxfs_inode_cachep, inode->i_private);
-}
-
/**
* vxfs_evict_inode - remove inode from main memory
* @ip: inode to discard.
@@ -360,5 +338,4 @@ vxfs_evict_inode(struct inode *ip)
{
truncate_inode_pages_final(&ip->i_data);
clear_inode(ip);
- call_rcu(&ip->i_rcu, vxfs_i_callback);
}
diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h
index 240aeb11263fa..f012abed125d6 100644
--- a/fs/freevxfs/vxfs_inode.h
+++ b/fs/freevxfs/vxfs_inode.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -66,74 +67,74 @@ enum {
* Data stored immediately in the inode.
*/
struct vxfs_immed {
- u_int8_t vi_immed[VXFS_NIMMED];
+ __u8 vi_immed[VXFS_NIMMED];
};
struct vxfs_ext4 {
- u_int32_t ve4_spare; /* ?? */
- u_int32_t ve4_indsize; /* Indirect extent size */
- vx_daddr_t ve4_indir[VXFS_NIADDR]; /* Indirect extents */
+ __fs32 ve4_spare; /* ?? */
+ __fs32 ve4_indsize; /* Indirect extent size */
+ __fs32 ve4_indir[VXFS_NIADDR]; /* Indirect extents */
struct direct { /* Direct extents */
- vx_daddr_t extent; /* Extent number */
- int32_t size; /* Size of extent */
+ __fs32 extent; /* Extent number */
+ __fs32 size; /* Size of extent */
} ve4_direct[VXFS_NDADDR];
};
struct vxfs_typed {
- u_int64_t vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
- vx_daddr_t vt_block; /* Extent block */
- int32_t vt_size; /* Size in blocks */
+ __fs64 vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
+ __fs32 vt_block; /* Extent block */
+ __fs32 vt_size; /* Size in blocks */
};
struct vxfs_typed_dev4 {
- u_int64_t vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
- u_int64_t vd4_block; /* Extent block */
- u_int64_t vd4_size; /* Size in blocks */
- int32_t vd4_dev; /* Device ID */
- u_int32_t __pad1;
+ __fs64 vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
+ __fs64 vd4_block; /* Extent block */
+ __fs64 vd4_size; /* Size in blocks */
+ __fs32 vd4_dev; /* Device ID */
+ __u8 __pad1;
};
/*
* The inode as contained on the physical device.
*/
struct vxfs_dinode {
- int32_t vdi_mode;
- u_int32_t vdi_nlink; /* Link count */
- u_int32_t vdi_uid; /* UID */
- u_int32_t vdi_gid; /* GID */
- u_int64_t vdi_size; /* Inode size in bytes */
- u_int32_t vdi_atime; /* Last time accessed - sec */
- u_int32_t vdi_autime; /* Last time accessed - usec */
- u_int32_t vdi_mtime; /* Last modify time - sec */
- u_int32_t vdi_mutime; /* Last modify time - usec */
- u_int32_t vdi_ctime; /* Create time - sec */
- u_int32_t vdi_cutime; /* Create time - usec */
- u_int8_t vdi_aflags; /* Allocation flags */
- u_int8_t vdi_orgtype; /* Organisation type */
- u_int16_t vdi_eopflags;
- u_int32_t vdi_eopdata;
+ __fs32 vdi_mode;
+ __fs32 vdi_nlink; /* Link count */
+ __fs32 vdi_uid; /* UID */
+ __fs32 vdi_gid; /* GID */
+ __fs64 vdi_size; /* Inode size in bytes */
+ __fs32 vdi_atime; /* Last time accessed - sec */
+ __fs32 vdi_autime; /* Last time accessed - usec */
+ __fs32 vdi_mtime; /* Last modify time - sec */
+ __fs32 vdi_mutime; /* Last modify time - usec */
+ __fs32 vdi_ctime; /* Create time - sec */
+ __fs32 vdi_cutime; /* Create time - usec */
+ __u8 vdi_aflags; /* Allocation flags */
+ __u8 vdi_orgtype; /* Organisation type */
+ __fs16 vdi_eopflags;
+ __fs32 vdi_eopdata;
union {
- u_int32_t rdev;
- u_int32_t dotdot;
+ __fs32 rdev;
+ __fs32 dotdot;
struct {
- u_int32_t reserved;
- u_int32_t fixextsize;
+ __u32 reserved;
+ __fs32 fixextsize;
} i_regular;
struct {
- u_int32_t matchino;
- u_int32_t fsetindex;
+ __fs32 matchino;
+ __fs32 fsetindex;
} i_vxspec;
- u_int64_t align;
+ __u64 align;
} vdi_ftarea;
- u_int32_t vdi_blocks; /* How much blocks does inode occupy */
- u_int32_t vdi_gen; /* Inode generation */
- u_int64_t vdi_version; /* Version */
+ __fs32 vdi_blocks; /* How much blocks does inode occupy */
+ __fs32 vdi_gen; /* Inode generation */
+ __fs64 vdi_version; /* Version */
union {
struct vxfs_immed immed;
struct vxfs_ext4 ext4;
struct vxfs_typed typed[VXFS_NTYPED];
} vdi_org;
- u_int32_t vdi_iattrino;
+ __fs32 vdi_iattrino;
};
#define vdi_rdev vdi_ftarea.rdev
@@ -149,32 +150,45 @@ struct vxfs_dinode {
/*
* The inode as represented in the main memory.
- *
- * TBD: This should become a separate structure...
*/
-#define vxfs_inode_info vxfs_dinode
-
-#define vii_mode vdi_mode
-#define vii_uid vdi_uid
-#define vii_gid vdi_gid
-#define vii_nlink vdi_nlink
-#define vii_size vdi_size
-#define vii_atime vdi_atime
-#define vii_ctime vdi_ctime
-#define vii_mtime vdi_mtime
-#define vii_blocks vdi_blocks
-#define vii_org vdi_org
-#define vii_orgtype vdi_orgtype
-#define vii_gen vdi_gen
-
-#define vii_rdev vdi_ftarea.rdev
-#define vii_dotdot vdi_ftarea.dotdot
-#define vii_fixextsize vdi_ftarea.regular.fixextsize
-#define vii_matchino vdi_ftarea.vxspec.matchino
-#define vii_fsetindex vdi_ftarea.vxspec.fsetindex
-
-#define vii_immed vdi_org.immed
-#define vii_ext4 vdi_org.ext4
-#define vii_typed vdi_org.typed
+struct vxfs_inode_info {
+ struct inode vfs_inode;
+
+ __u32 vii_mode;
+ __u32 vii_nlink; /* Link count */
+ __u32 vii_uid; /* UID */
+ __u32 vii_gid; /* GID */
+ __u64 vii_size; /* Inode size in bytes */
+ __u32 vii_atime; /* Last time accessed - sec */
+ __u32 vii_autime; /* Last time accessed - usec */
+ __u32 vii_mtime; /* Last modify time - sec */
+ __u32 vii_mutime; /* Last modify time - usec */
+ __u32 vii_ctime; /* Create time - sec */
+ __u32 vii_cutime; /* Create time - usec */
+ __u8 vii_orgtype; /* Organisation type */
+ union {
+ __u32 rdev;
+ __u32 dotdot;
+ } vii_ftarea;
+ __u32 vii_blocks; /* How much blocks does inode occupy */
+ __u32 vii_gen; /* Inode generation */
+ union {
+ struct vxfs_immed immed;
+ struct vxfs_ext4 ext4;
+ struct vxfs_typed typed[VXFS_NTYPED];
+ } vii_org;
+};
+
+#define vii_rdev vii_ftarea.rdev
+#define vii_dotdot vii_ftarea.dotdot
+
+#define vii_immed vii_org.immed
+#define vii_ext4 vii_org.ext4
+#define vii_typed vii_org.typed
+
+static inline struct vxfs_inode_info *VXFS_INO(struct inode *inode)
+{
+ return container_of(inode, struct vxfs_inode_info, vfs_inode);
+}
#endif /* _VXFS_INODE_H_ */
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 1cff72df0389a..ce4785fd81c63 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -45,7 +46,7 @@
/*
* Number of VxFS blocks per page.
*/
-#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize))
+#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_SIZE / (sbp)->s_blocksize))
static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int);
@@ -58,36 +59,9 @@ const struct inode_operations vxfs_dir_inode_ops = {
const struct file_operations vxfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = vxfs_readdir,
+ .iterate_shared = vxfs_readdir,
};
-static inline u_long
-dir_blocks(struct inode *ip)
-{
- u_long bsize = ip->i_sb->s_blocksize;
- return (ip->i_size + bsize - 1) & ~(bsize - 1);
-}
-
-/*
- * NOTE! unlike strncmp, vxfs_match returns 1 for success, 0 for failure.
- *
- * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller.
- */
-static inline int
-vxfs_match(int len, const char * const name, struct vxfs_direct *de)
-{
- if (len != de->d_namelen)
- return 0;
- if (!de->d_ino)
- return 0;
- return !memcmp(name, de->d_name, len);
-}
-
-static inline struct vxfs_direct *
-vxfs_next_entry(struct vxfs_direct *de)
-{
- return ((struct vxfs_direct *)((char*)de + de->d_reclen));
-}
/**
* vxfs_find_entry - find a mathing directory entry for a dentry
@@ -106,50 +80,64 @@ vxfs_next_entry(struct vxfs_direct *de)
static struct vxfs_direct *
vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp)
{
- u_long npages, page, nblocks, pblocks, block;
- u_long bsize = ip->i_sb->s_blocksize;
- const char *name = dp->d_name.name;
- int namelen = dp->d_name.len;
-
- npages = dir_pages(ip);
- nblocks = dir_blocks(ip);
- pblocks = VXFS_BLOCK_PER_PAGE(ip->i_sb);
-
- for (page = 0; page < npages; page++) {
- caddr_t kaddr;
- struct page *pp;
+ u_long bsize = ip->i_sb->s_blocksize;
+ const char *name = dp->d_name.name;
+ int namelen = dp->d_name.len;
+ loff_t limit = VXFS_DIRROUND(ip->i_size);
+ struct vxfs_direct *de_exit = NULL;
+ loff_t pos = 0;
+ struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb);
- pp = vxfs_get_page(ip->i_mapping, page);
+ while (pos < limit) {
+ struct page *pp;
+ char *kaddr;
+ int pg_ofs = pos & ~PAGE_MASK;
+
+ pp = vxfs_get_page(ip->i_mapping, pos >> PAGE_SHIFT);
if (IS_ERR(pp))
- continue;
- kaddr = (caddr_t)page_address(pp);
-
- for (block = 0; block <= nblocks && block <= pblocks; block++) {
- caddr_t baddr, limit;
- struct vxfs_dirblk *dbp;
- struct vxfs_direct *de;
-
- baddr = kaddr + (block * bsize);
- limit = baddr + bsize - VXFS_DIRLEN(1);
-
- dbp = (struct vxfs_dirblk *)baddr;
- de = (struct vxfs_direct *)(baddr + VXFS_DIRBLKOV(dbp));
-
- for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) {
- if (!de->d_reclen)
- break;
- if (!de->d_ino)
- continue;
- if (vxfs_match(namelen, name, de)) {
- *ppp = pp;
- return (de);
- }
+ return NULL;
+ kaddr = (char *)page_address(pp);
+
+ while (pg_ofs < PAGE_SIZE && pos < limit) {
+ struct vxfs_direct *de;
+
+ if ((pos & (bsize - 1)) < 4) {
+ struct vxfs_dirblk *dbp =
+ (struct vxfs_dirblk *)
+ (kaddr + (pos & ~PAGE_MASK));
+ int overhead = VXFS_DIRBLKOV(sbi, dbp);
+
+ pos += overhead;
+ pg_ofs += overhead;
+ }
+ de = (struct vxfs_direct *)(kaddr + pg_ofs);
+
+ if (!de->d_reclen) {
+ pos += bsize - 1;
+ pos &= ~(bsize - 1);
+ break;
+ }
+
+ pg_ofs += fs16_to_cpu(sbi, de->d_reclen);
+ pos += fs16_to_cpu(sbi, de->d_reclen);
+ if (!de->d_ino)
+ continue;
+
+ if (namelen != fs16_to_cpu(sbi, de->d_namelen))
+ continue;
+ if (!memcmp(name, de->d_name, namelen)) {
+ *ppp = pp;
+ de_exit = de;
+ break;
}
}
- vxfs_put_page(pp);
+ if (!de_exit)
+ vxfs_put_page(pp);
+ else
+ break;
}
- return NULL;
+ return de_exit;
}
/**
@@ -173,9 +161,9 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
de = vxfs_find_entry(dip, dp, &pp);
if (de) {
- ino = de->d_ino;
+ ino = fs32_to_cpu(VXFS_SBI(dip->i_sb), de->d_ino);
kunmap(pp);
- page_cache_release(pp);
+ put_page(pp);
}
return (ino);
@@ -233,74 +221,80 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
struct inode *ip = file_inode(fp);
struct super_block *sbp = ip->i_sb;
u_long bsize = sbp->s_blocksize;
- u_long page, npages, block, pblocks, nblocks, offset;
- loff_t pos;
+ loff_t pos, limit;
+ struct vxfs_sb_info *sbi = VXFS_SBI(sbp);
if (ctx->pos == 0) {
if (!dir_emit_dot(fp, ctx))
- return 0;
- ctx->pos = 1;
+ goto out;
+ ctx->pos++;
}
if (ctx->pos == 1) {
if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
- return 0;
- ctx->pos = 2;
+ goto out;
+ ctx->pos++;
}
- pos = ctx->pos - 2;
-
- if (pos > VXFS_DIRROUND(ip->i_size))
- return 0;
- npages = dir_pages(ip);
- nblocks = dir_blocks(ip);
- pblocks = VXFS_BLOCK_PER_PAGE(sbp);
+ limit = VXFS_DIRROUND(ip->i_size);
+ if (ctx->pos > limit)
+ goto out;
- page = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ~PAGE_CACHE_MASK;
- block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
+ pos = ctx->pos & ~3L;
- for (; page < npages; page++, block = 0) {
- char *kaddr;
- struct page *pp;
+ while (pos < limit) {
+ struct page *pp;
+ char *kaddr;
+ int pg_ofs = pos & ~PAGE_MASK;
+ int rc = 0;
- pp = vxfs_get_page(ip->i_mapping, page);
+ pp = vxfs_get_page(ip->i_mapping, pos >> PAGE_SHIFT);
if (IS_ERR(pp))
- continue;
+ return -ENOMEM;
+
kaddr = (char *)page_address(pp);
- for (; block <= nblocks && block <= pblocks; block++) {
- char *baddr, *limit;
- struct vxfs_dirblk *dbp;
- struct vxfs_direct *de;
+ while (pg_ofs < PAGE_SIZE && pos < limit) {
+ struct vxfs_direct *de;
- baddr = kaddr + (block * bsize);
- limit = baddr + bsize - VXFS_DIRLEN(1);
-
- dbp = (struct vxfs_dirblk *)baddr;
- de = (struct vxfs_direct *)
- (offset ?
- (kaddr + offset) :
- (baddr + VXFS_DIRBLKOV(dbp)));
-
- for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
- if (!de->d_reclen)
- break;
- if (!de->d_ino)
- continue;
-
- offset = (char *)de - kaddr;
- ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
- if (!dir_emit(ctx, de->d_name, de->d_namelen,
- de->d_ino, DT_UNKNOWN)) {
- vxfs_put_page(pp);
- return 0;
- }
+ if ((pos & (bsize - 1)) < 4) {
+ struct vxfs_dirblk *dbp =
+ (struct vxfs_dirblk *)
+ (kaddr + (pos & ~PAGE_MASK));
+ int overhead = VXFS_DIRBLKOV(sbi, dbp);
+
+ pos += overhead;
+ pg_ofs += overhead;
+ }
+ de = (struct vxfs_direct *)(kaddr + pg_ofs);
+
+ if (!de->d_reclen) {
+ pos += bsize - 1;
+ pos &= ~(bsize - 1);
+ break;
+ }
+
+ pg_ofs += fs16_to_cpu(sbi, de->d_reclen);
+ pos += fs16_to_cpu(sbi, de->d_reclen);
+ if (!de->d_ino)
+ continue;
+
+ rc = dir_emit(ctx, de->d_name,
+ fs16_to_cpu(sbi, de->d_namelen),
+ fs32_to_cpu(sbi, de->d_ino),
+ DT_UNKNOWN);
+ if (!rc) {
+ /* the dir entry was not read, fix pos. */
+ pos -= fs16_to_cpu(sbi, de->d_reclen);
+ break;
}
- offset = 0;
}
vxfs_put_page(pp);
- offset = 0;
+ if (!rc)
+ break;
}
- ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+
+ ctx->pos = pos | 2;
+
+out:
return 0;
}
diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c
index 0495008479034..813da66851510 100644
--- a/fs/freevxfs/vxfs_olt.c
+++ b/fs/freevxfs/vxfs_olt.c
@@ -43,14 +43,14 @@ static inline void
vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp)
{
BUG_ON(infp->vsi_fshino);
- infp->vsi_fshino = fshp->olt_fsino[0];
+ infp->vsi_fshino = fs32_to_cpu(infp, fshp->olt_fsino[0]);
}
static inline void
vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp)
{
BUG_ON(infp->vsi_iext);
- infp->vsi_iext = ilistp->olt_iext[0];
+ infp->vsi_iext = fs32_to_cpu(infp, ilistp->olt_iext[0]);
}
static inline u_long
@@ -81,13 +81,12 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize)
struct vxfs_olt *op;
char *oaddr, *eaddr;
-
bp = sb_bread(sbp, vxfs_oblock(sbp, infp->vsi_oltext, bsize));
if (!bp || !bp->b_data)
goto fail;
op = (struct vxfs_olt *)bp->b_data;
- if (op->olt_magic != VXFS_OLT_MAGIC) {
+ if (fs32_to_cpu(infp, op->olt_magic) != VXFS_OLT_MAGIC) {
printk(KERN_NOTICE "vxfs: ivalid olt magic number\n");
goto fail;
}
@@ -102,14 +101,14 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize)
goto fail;
}
- oaddr = bp->b_data + op->olt_size;
+ oaddr = bp->b_data + fs32_to_cpu(infp, op->olt_size);
eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize);
while (oaddr < eaddr) {
struct vxfs_oltcommon *ocp =
(struct vxfs_oltcommon *)oaddr;
- switch (ocp->olt_type) {
+ switch (fs32_to_cpu(infp, ocp->olt_type)) {
case VXFS_OLT_FSHEAD:
vxfs_get_fshead((struct vxfs_oltfshead *)oaddr, infp);
break;
@@ -118,11 +117,11 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize)
break;
}
- oaddr += ocp->olt_size;
+ oaddr += fs32_to_cpu(infp, ocp->olt_size);
}
brelse(bp);
- return 0;
+ return (infp->vsi_fshino && infp->vsi_iext) ? 0 : -EINVAL;
fail:
brelse(bp);
diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h
index b7b3af5026158..0c0b0c9fa5579 100644
--- a/fs/freevxfs/vxfs_olt.h
+++ b/fs/freevxfs/vxfs_olt.h
@@ -63,83 +63,83 @@ enum {
* the initial inode list, the fileset header or the device configuration.
*/
struct vxfs_olt {
- u_int32_t olt_magic; /* magic number */
- u_int32_t olt_size; /* size of this entry */
- u_int32_t olt_checksum; /* checksum of extent */
- u_int32_t __unused1; /* ??? */
- u_int32_t olt_mtime; /* time of last mod. (sec) */
- u_int32_t olt_mutime; /* time of last mod. (usec) */
- u_int32_t olt_totfree; /* free space in OLT extent */
- vx_daddr_t olt_extents[2]; /* addr of this extent, replica */
- u_int32_t olt_esize; /* size of this extent */
- vx_daddr_t olt_next[2]; /* addr of next extent, replica */
- u_int32_t olt_nsize; /* size of next extent */
- u_int32_t __unused2; /* align to 8 byte boundary */
+ __fs32 olt_magic; /* magic number */
+ __fs32 olt_size; /* size of this entry */
+ __fs32 olt_checksum; /* checksum of extent */
+ __u32 __unused1; /* ??? */
+ __fs32 olt_mtime; /* time of last mod. (sec) */
+ __fs32 olt_mutime; /* time of last mod. (usec) */
+ __fs32 olt_totfree; /* free space in OLT extent */
+ __fs32 olt_extents[2]; /* addr of this extent, replica */
+ __fs32 olt_esize; /* size of this extent */
+ __fs32 olt_next[2]; /* addr of next extent, replica */
+ __fs32 olt_nsize; /* size of next extent */
+ __u32 __unused2; /* align to 8 byte boundary */
};
/*
* VxFS common OLT entry (on disk).
*/
struct vxfs_oltcommon {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
};
/*
* VxFS free OLT entry (on disk).
*/
struct vxfs_oltfree {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_fsize; /* size of this free record */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_fsize; /* size of this free record */
};
/*
* VxFS initial-inode list (on disk).
*/
struct vxfs_oltilist {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_iext[2]; /* initial inode list, replica */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_iext[2]; /* initial inode list, replica */
};
/*
* Current Usage Table
*/
struct vxfs_oltcut {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_cutino; /* inode of current usage table */
- u_int32_t __pad; /* unused, 8 byte align */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_cutino; /* inode of current usage table */
+ __u8 __pad; /* unused, 8 byte align */
};
/*
* Inodes containing Superblock, Intent log and OLTs
*/
struct vxfs_oltsb {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_sbino; /* inode of superblock file */
- u_int32_t __unused1; /* ??? */
- vx_ino_t olt_logino[2]; /* inode of log file,replica */
- vx_ino_t olt_oltino[2]; /* inode of OLT, replica */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_sbino; /* inode of superblock file */
+ __u32 __unused1; /* ??? */
+ __fs32 olt_logino[2]; /* inode of log file,replica */
+ __fs32 olt_oltino[2]; /* inode of OLT, replica */
};
/*
* Inode containing device configuration + it's replica
*/
struct vxfs_oltdev {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_devino[2]; /* inode of device config files */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_devino[2]; /* inode of device config files */
};
/*
* Fileset header
*/
struct vxfs_oltfshead {
- u_int32_t olt_type; /* type number */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_fsino[2]; /* inodes of fileset header */
+ __fs32 olt_type; /* type number */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_fsino[2]; /* inodes of fileset header */
};
#endif /* _VXFS_OLT_H_ */
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 5d318c44f8554..e806694d4145e 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -50,7 +50,7 @@ inline void
vxfs_put_page(struct page *pp)
{
kunmap(pp);
- page_cache_release(pp);
+ put_page(pp);
}
/**
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 7ca8c75d50d3f..455ce5b77e9bf 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -48,22 +49,11 @@
#include "vxfs_inode.h"
-MODULE_AUTHOR("Christoph Hellwig");
+MODULE_AUTHOR("Christoph Hellwig, Krzysztof Blaszkowski");
MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver");
MODULE_LICENSE("Dual BSD/GPL");
-
-
-static void vxfs_put_super(struct super_block *);
-static int vxfs_statfs(struct dentry *, struct kstatfs *);
-static int vxfs_remount(struct super_block *, int *, char *);
-
-static const struct super_operations vxfs_super_ops = {
- .evict_inode = vxfs_evict_inode,
- .put_super = vxfs_put_super,
- .statfs = vxfs_statfs,
- .remount_fs = vxfs_remount,
-};
+static struct kmem_cache *vxfs_inode_cachep;
/**
* vxfs_put_super - free superblock resources
@@ -79,9 +69,9 @@ vxfs_put_super(struct super_block *sbp)
{
struct vxfs_sb_info *infp = VXFS_SBI(sbp);
- vxfs_put_fake_inode(infp->vsi_fship);
- vxfs_put_fake_inode(infp->vsi_ilist);
- vxfs_put_fake_inode(infp->vsi_stilist);
+ iput(infp->vsi_fship);
+ iput(infp->vsi_ilist);
+ iput(infp->vsi_stilist);
brelse(infp->vsi_bp);
kfree(infp);
@@ -109,14 +99,15 @@ static int
vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
{
struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb);
+ struct vxfs_sb *raw_sb = infp->vsi_raw;
bufp->f_type = VXFS_SUPER_MAGIC;
bufp->f_bsize = dentry->d_sb->s_blocksize;
- bufp->f_blocks = infp->vsi_raw->vs_dsize;
- bufp->f_bfree = infp->vsi_raw->vs_free;
+ bufp->f_blocks = fs32_to_cpu(infp, raw_sb->vs_dsize);
+ bufp->f_bfree = fs32_to_cpu(infp, raw_sb->vs_free);
bufp->f_bavail = 0;
bufp->f_files = 0;
- bufp->f_ffree = infp->vsi_raw->vs_ifree;
+ bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree);
bufp->f_namelen = VXFS_NAMELEN;
return 0;
@@ -129,6 +120,81 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+static struct inode *vxfs_alloc_inode(struct super_block *sb)
+{
+ struct vxfs_inode_info *vi;
+
+ vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL);
+ if (!vi)
+ return NULL;
+ inode_init_once(&vi->vfs_inode);
+ return &vi->vfs_inode;
+}
+
+static void vxfs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+
+ kmem_cache_free(vxfs_inode_cachep, VXFS_INO(inode));
+}
+
+static void vxfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, vxfs_i_callback);
+}
+
+static const struct super_operations vxfs_super_ops = {
+ .alloc_inode = vxfs_alloc_inode,
+ .destroy_inode = vxfs_destroy_inode,
+ .evict_inode = vxfs_evict_inode,
+ .put_super = vxfs_put_super,
+ .statfs = vxfs_statfs,
+ .remount_fs = vxfs_remount,
+};
+
+static int vxfs_try_sb_magic(struct super_block *sbp, int silent,
+ unsigned blk, __fs32 magic)
+{
+ struct buffer_head *bp;
+ struct vxfs_sb *rsbp;
+ struct vxfs_sb_info *infp = VXFS_SBI(sbp);
+ int rc = -ENOMEM;
+
+ bp = sb_bread(sbp, blk);
+ do {
+ if (!bp || !buffer_mapped(bp)) {
+ if (!silent) {
+ printk(KERN_WARNING
+ "vxfs: unable to read disk superblock at %u\n",
+ blk);
+ }
+ break;
+ }
+
+ rc = -EINVAL;
+ rsbp = (struct vxfs_sb *)bp->b_data;
+ if (rsbp->vs_magic != magic) {
+ if (!silent)
+ printk(KERN_NOTICE
+ "vxfs: WRONG superblock magic %08x at %u\n",
+ rsbp->vs_magic, blk);
+ break;
+ }
+
+ rc = 0;
+ infp->vsi_raw = rsbp;
+ infp->vsi_bp = bp;
+ } while (0);
+
+ if (rc) {
+ infp->vsi_raw = NULL;
+ infp->vsi_bp = NULL;
+ brelse(bp);
+ }
+
+ return rc;
+}
+
/**
* vxfs_read_super - read superblock into memory and initialize filesystem
* @sbp: VFS superblock (to fill)
@@ -149,10 +215,10 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
{
struct vxfs_sb_info *infp;
struct vxfs_sb *rsbp;
- struct buffer_head *bp = NULL;
u_long bsize;
struct inode *root;
int ret = -EINVAL;
+ u32 j;
sbp->s_flags |= MS_RDONLY;
@@ -168,42 +234,43 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
goto out;
}
- bp = sb_bread(sbp, 1);
- if (!bp || !buffer_mapped(bp)) {
- if (!silent) {
- printk(KERN_WARNING
- "vxfs: unable to read disk superblock\n");
- }
- goto out;
- }
+ sbp->s_op = &vxfs_super_ops;
+ sbp->s_fs_info = infp;
- rsbp = (struct vxfs_sb *)bp->b_data;
- if (rsbp->vs_magic != VXFS_SUPER_MAGIC) {
+ if (!vxfs_try_sb_magic(sbp, silent, 1,
+ (__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) {
+ /* Unixware, x86 */
+ infp->byte_order = VXFS_BO_LE;
+ } else if (!vxfs_try_sb_magic(sbp, silent, 8,
+ (__force __fs32)cpu_to_be32(VXFS_SUPER_MAGIC))) {
+ /* HP-UX, parisc */
+ infp->byte_order = VXFS_BO_BE;
+ } else {
if (!silent)
- printk(KERN_NOTICE "vxfs: WRONG superblock magic\n");
+ printk(KERN_NOTICE "vxfs: can't find superblock.\n");
goto out;
}
- if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) {
- printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n",
- rsbp->vs_version);
+ rsbp = infp->vsi_raw;
+ j = fs32_to_cpu(infp, rsbp->vs_version);
+ if ((j < 2 || j > 4) && !silent) {
+ printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", j);
goto out;
}
#ifdef DIAGNOSTIC
- printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version);
- printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize);
+ printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", j);
+ printk(KERN_DEBUG "vxfs: blocksize: %d\n",
+ fs32_to_cpu(infp, rsbp->vs_bsize));
#endif
- sbp->s_magic = rsbp->vs_magic;
- sbp->s_fs_info = infp;
+ sbp->s_magic = fs32_to_cpu(infp, rsbp->vs_magic);
- infp->vsi_raw = rsbp;
- infp->vsi_bp = bp;
- infp->vsi_oltext = rsbp->vs_oltext[0];
- infp->vsi_oltsize = rsbp->vs_oltsize;
+ infp->vsi_oltext = fs32_to_cpu(infp, rsbp->vs_oltext[0]);
+ infp->vsi_oltsize = fs32_to_cpu(infp, rsbp->vs_oltsize);
- if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) {
+ j = fs32_to_cpu(infp, rsbp->vs_bsize);
+ if (!sb_set_blocksize(sbp, j)) {
printk(KERN_WARNING "vxfs: unable to set final block size\n");
goto out;
}
@@ -218,7 +285,6 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
goto out;
}
- sbp->s_op = &vxfs_super_ops;
root = vxfs_iget(sbp, VXFS_ROOT_INO);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
@@ -233,11 +299,11 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
return 0;
out_free_ilist:
- vxfs_put_fake_inode(infp->vsi_fship);
- vxfs_put_fake_inode(infp->vsi_ilist);
- vxfs_put_fake_inode(infp->vsi_stilist);
+ iput(infp->vsi_fship);
+ iput(infp->vsi_ilist);
+ iput(infp->vsi_stilist);
out:
- brelse(bp);
+ brelse(infp->vsi_bp);
kfree(infp);
return ret;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index fee81e8768c95..4d09d4441e3ee 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -33,7 +33,7 @@
/*
* 4MB minimal write chunk size
*/
-#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
+#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
struct wb_completion {
atomic_t cnt;
@@ -483,9 +483,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
goto out_free;
}
inode->i_state |= I_WB_SWITCH;
+ __iget(inode);
spin_unlock(&inode->i_lock);
- ihold(inode);
isw->inode = inode;
atomic_inc(&isw_nr_in_flight);
@@ -931,7 +931,8 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
* This is WB_SYNC_NONE writeback, so if allocation fails just
* wakeup the thread for old dirty data writeback
*/
- work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ work = kzalloc(sizeof(*work),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (!work) {
trace_writeback_nowork(wb);
wb_wakeup(wb);
@@ -980,6 +981,42 @@ void inode_io_list_del(struct inode *inode)
}
/*
+ * mark an inode as under writeback on the sb
+ */
+void sb_mark_inode_writeback(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long flags;
+
+ if (list_empty(&inode->i_wb_list)) {
+ spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
+ if (list_empty(&inode->i_wb_list)) {
+ list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
+ trace_sb_mark_inode_writeback(inode);
+ }
+ spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
+ }
+}
+
+/*
+ * clear an inode as under writeback on the sb
+ */
+void sb_clear_inode_writeback(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long flags;
+
+ if (!list_empty(&inode->i_wb_list)) {
+ spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
+ if (!list_empty(&inode->i_wb_list)) {
+ list_del_init(&inode->i_wb_list);
+ trace_sb_clear_inode_writeback(inode);
+ }
+ spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
+ }
+}
+
+/*
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
* furthest end of its superblock's dirty-inode list.
*
@@ -1290,6 +1327,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
dirty = inode->i_state & I_DIRTY;
if (inode->i_state & I_DIRTY_TIME) {
if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ wbc->sync_mode == WB_SYNC_ALL ||
unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
unlikely(time_after(jiffies,
(inode->dirtied_time_when +
@@ -1770,8 +1808,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
*/
static unsigned long get_nr_dirty_pages(void)
{
- return global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
+ return global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_UNSTABLE_NFS) +
get_nr_dirty_inodes();
}
@@ -2153,7 +2191,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
*/
static void wait_sb_inodes(struct super_block *sb)
{
- struct inode *inode, *old_inode = NULL;
+ LIST_HEAD(sync_list);
/*
* We need to be protected against the filesystem going from
@@ -2162,38 +2200,60 @@ static void wait_sb_inodes(struct super_block *sb)
WARN_ON(!rwsem_is_locked(&sb->s_umount));
mutex_lock(&sb->s_sync_lock);
- spin_lock(&sb->s_inode_list_lock);
/*
- * Data integrity sync. Must wait for all pages under writeback,
- * because there may have been pages dirtied before our sync
- * call, but which had writeout started before we write it out.
- * In which case, the inode may not be on the dirty list, but
- * we still have to wait for that writeout.
+ * Splice the writeback list onto a temporary list to avoid waiting on
+ * inodes that have started writeback after this point.
+ *
+ * Use rcu_read_lock() to keep the inodes around until we have a
+ * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
+ * the local list because inodes can be dropped from either by writeback
+ * completion.
+ */
+ rcu_read_lock();
+ spin_lock_irq(&sb->s_inode_wblist_lock);
+ list_splice_init(&sb->s_inodes_wb, &sync_list);
+
+ /*
+ * Data integrity sync. Must wait for all pages under writeback, because
+ * there may have been pages dirtied before our sync call, but which had
+ * writeout started before we write it out. In which case, the inode
+ * may not be on the dirty list, but we still have to wait for that
+ * writeout.
*/
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ while (!list_empty(&sync_list)) {
+ struct inode *inode = list_first_entry(&sync_list, struct inode,
+ i_wb_list);
struct address_space *mapping = inode->i_mapping;
+ /*
+ * Move each inode back to the wb list before we drop the lock
+ * to preserve consistency between i_wb_list and the mapping
+ * writeback tag. Writeback completion is responsible to remove
+ * the inode from either list once the writeback tag is cleared.
+ */
+ list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
+
+ /*
+ * The mapping can appear untagged while still on-list since we
+ * do not have the mapping lock. Skip it here, wb completion
+ * will remove it.
+ */
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+ continue;
+
+ spin_unlock_irq(&sb->s_inode_wblist_lock);
+
spin_lock(&inode->i_lock);
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (mapping->nrpages == 0)) {
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
spin_unlock(&inode->i_lock);
+
+ spin_lock_irq(&sb->s_inode_wblist_lock);
continue;
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
-
- /*
- * We hold a reference to 'inode' so it couldn't have been
- * removed from s_inodes list while we dropped the
- * s_inode_list_lock. We cannot iput the inode now as we can
- * be holding the last reference and we cannot iput it under
- * s_inode_list_lock. So we keep the reference and iput it
- * later.
- */
- iput(old_inode);
- old_inode = inode;
+ rcu_read_unlock();
/*
* We keep the error status of individual mapping so that
@@ -2204,10 +2264,13 @@ static void wait_sb_inodes(struct super_block *sb)
cond_resched();
- spin_lock(&sb->s_inode_list_lock);
+ iput(inode);
+
+ rcu_read_lock();
+ spin_lock_irq(&sb->s_inode_wblist_lock);
}
- spin_unlock(&sb->s_inode_list_lock);
- iput(old_inode);
+ spin_unlock_irq(&sb->s_inode_wblist_lock);
+ rcu_read_unlock();
mutex_unlock(&sb->s_sync_lock);
}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
index 7d637e2335fda..15a3d042247e9 100644
--- a/fs/fscache/histogram.c
+++ b/fs/fscache/histogram.c
@@ -99,7 +99,6 @@ static int fscache_histogram_open(struct inode *inode, struct file *file)
}
const struct file_operations fscache_histogram_fops = {
- .owner = THIS_MODULE,
.open = fscache_histogram_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 6b028b7c42509..5d5ddaa84b215 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -404,7 +404,6 @@ static int fscache_objlist_release(struct inode *inode, struct file *file)
}
const struct file_operations fscache_objlist_fops = {
- .owner = THIS_MODULE,
.open = fscache_objlist_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 6b35fc4860a03..c8c4f79c7ce16 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -113,7 +113,7 @@ try_again:
wake_up_bit(&cookie->flags, 0);
if (xpage)
- page_cache_release(xpage);
+ put_page(xpage);
__fscache_uncache_page(cookie, page);
return true;
@@ -164,7 +164,7 @@ static void fscache_end_page_write(struct fscache_object *object,
}
spin_unlock(&object->lock);
if (xpage)
- page_cache_release(xpage);
+ put_page(xpage);
}
/*
@@ -884,9 +884,11 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
spin_unlock(&cookie->stores_lock);
for (i = n - 1; i >= 0; i--)
- page_cache_release(results[i]);
+ put_page(results[i]);
}
+ wake_up_bit(&cookie->flags, 0);
+
_leave("");
}
@@ -982,7 +984,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
radix_tree_tag_set(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG);
- page_cache_get(page);
+ get_page(page);
/* we only want one writer at a time, but we do need to queue new
* writers after exclusive ops */
@@ -1026,7 +1028,7 @@ submit_failed:
radix_tree_delete(&cookie->stores, page->index);
spin_unlock(&cookie->stores_lock);
wake_cookie = __fscache_unuse_cookie(cookie);
- page_cache_release(page);
+ put_page(page);
ret = -ENOBUFS;
goto nobufs;
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 7cfa0aacdf6d5..7ac6e839b065b 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -295,7 +295,6 @@ static int fscache_stats_open(struct inode *inode, struct file *file)
}
const struct file_operations fscache_stats_fops = {
- .owner = THIS_MODULE,
.open = fscache_stats_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ebb5e37455a07..a94d2ed81ab4a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -99,19 +99,6 @@ void fuse_request_free(struct fuse_req *req)
kmem_cache_free(fuse_req_cachep, req);
}
-static void block_sigs(sigset_t *oldset)
-{
- sigset_t mask;
-
- siginitsetinv(&mask, sigmask(SIGKILL));
- sigprocmask(SIG_BLOCK, &mask, oldset);
-}
-
-static void restore_sigs(sigset_t *oldset)
-{
- sigprocmask(SIG_SETMASK, oldset, NULL);
-}
-
void __fuse_get_request(struct fuse_req *req)
{
atomic_inc(&req->count);
@@ -151,15 +138,9 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
atomic_inc(&fc->num_waiting);
if (fuse_block_alloc(fc, for_background)) {
- sigset_t oldset;
- int intr;
-
- block_sigs(&oldset);
- intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
- !fuse_block_alloc(fc, for_background));
- restore_sigs(&oldset);
err = -EINTR;
- if (intr)
+ if (wait_event_killable_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background)))
goto out;
}
/* Matches smp_wmb() in fuse_set_initialized() */
@@ -446,14 +427,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
}
if (!test_bit(FR_FORCE, &req->flags)) {
- sigset_t oldset;
-
/* Only fatal signals may interrupt this */
- block_sigs(&oldset);
- err = wait_event_interruptible(req->waitq,
+ err = wait_event_killable(req->waitq,
test_bit(FR_FINISHED, &req->flags));
- restore_sigs(&oldset);
-
if (!err)
return;
@@ -897,7 +873,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
return err;
}
- page_cache_get(newpage);
+ get_page(newpage);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
lru_cache_add_file(newpage);
@@ -912,12 +888,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (err) {
unlock_page(newpage);
- page_cache_release(newpage);
+ put_page(newpage);
return err;
}
unlock_page(oldpage);
- page_cache_release(oldpage);
+ put_page(oldpage);
cs->len = 0;
return 0;
@@ -951,7 +927,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
fuse_copy_finish(cs);
buf = cs->pipebufs;
- page_cache_get(page);
+ get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = count;
@@ -1435,7 +1411,7 @@ out_unlock:
out:
for (; page_nr < cs.nr_segs; page_nr++)
- page_cache_release(bufs[page_nr].page);
+ put_page(bufs[page_nr].page);
kfree(bufs);
return ret;
@@ -1525,7 +1501,6 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
goto err;
fuse_copy_finish(cs);
buf[outarg.namelen] = 0;
- name.hash = full_name_hash(name.name, name.len);
down_read(&fc->killsb);
err = -ENOENT;
@@ -1576,7 +1551,6 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
goto err;
fuse_copy_finish(cs);
buf[outarg.namelen] = 0;
- name.hash = full_name_hash(name.name, name.len);
down_read(&fc->killsb);
err = -ENOENT;
@@ -1632,8 +1606,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
goto out_up_killsb;
mapping = inode->i_mapping;
- index = outarg.offset >> PAGE_CACHE_SHIFT;
- offset = outarg.offset & ~PAGE_CACHE_MASK;
+ index = outarg.offset >> PAGE_SHIFT;
+ offset = outarg.offset & ~PAGE_MASK;
file_size = i_size_read(inode);
end = outarg.offset + outarg.size;
if (end > file_size) {
@@ -1652,13 +1626,13 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
if (!page)
goto out_iput;
- this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+ this_num = min_t(unsigned, num, PAGE_SIZE - offset);
err = fuse_copy_page(cs, &page, offset, this_num, 0);
if (!err && offset == 0 &&
- (this_num == PAGE_CACHE_SIZE || file_size == end))
+ (this_num == PAGE_SIZE || file_size == end))
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (err)
goto out_iput;
@@ -1697,7 +1671,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
size_t total_len = 0;
int num_pages;
- offset = outarg->offset & ~PAGE_CACHE_MASK;
+ offset = outarg->offset & ~PAGE_MASK;
file_size = i_size_read(inode);
num = outarg->size;
@@ -1720,7 +1694,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
req->page_descs[0].offset = offset;
req->end = fuse_retrieve_end;
- index = outarg->offset >> PAGE_CACHE_SHIFT;
+ index = outarg->offset >> PAGE_SHIFT;
while (num && req->num_pages < num_pages) {
struct page *page;
@@ -1730,7 +1704,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
if (!page)
break;
- this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+ this_num = min_t(unsigned, num, PAGE_SIZE - offset);
req->pages[req->num_pages] = page;
req->page_descs[req->num_pages].length = this_num;
req->num_pages++;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4b855b65d4577..c47b7780ce37b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -146,7 +146,7 @@ static void fuse_invalidate_entry(struct dentry *entry)
}
static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
- u64 nodeid, struct qstr *name,
+ u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg)
{
memset(outarg, 0, sizeof(struct fuse_entry_out));
@@ -282,7 +282,7 @@ int fuse_valid_type(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
-int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -341,8 +341,10 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
struct dentry *newent;
bool outarg_valid = true;
+ fuse_lock_inode(dir);
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
&outarg, &inode);
+ fuse_unlock_inode(dir);
if (err == -ENOENT) {
outarg_valid = false;
err = 0;
@@ -478,7 +480,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
struct fuse_conn *fc = get_fuse_conn(dir);
struct dentry *res = NULL;
- if (d_unhashed(entry)) {
+ if (d_in_lookup(entry)) {
res = fuse_lookup(dir, entry, 0);
if (IS_ERR(res))
return PTR_ERR(res);
@@ -953,6 +955,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
if (!dir)
goto unlock;
+ name->hash = full_name_hash(dir, name->name, name->len);
entry = d_lookup(dir, name);
dput(dir);
if (!entry)
@@ -1162,7 +1165,6 @@ static int fuse_direntplus_link(struct file *file,
struct fuse_direntplus *direntplus,
u64 attr_version)
{
- int err;
struct fuse_entry_out *o = &direntplus->entry_out;
struct fuse_dirent *dirent = &direntplus->dirent;
struct dentry *parent = file->f_path.dentry;
@@ -1172,6 +1174,7 @@ static int fuse_direntplus_link(struct file *file,
struct inode *dir = d_inode(parent);
struct fuse_conn *fc;
struct inode *inode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (!o->nodeid) {
/*
@@ -1202,67 +1205,63 @@ static int fuse_direntplus_link(struct file *file,
fc = get_fuse_conn(dir);
- name.hash = full_name_hash(name.name, name.len);
+ name.hash = full_name_hash(parent, name.name, name.len);
dentry = d_lookup(parent, &name);
- if (dentry) {
+ if (!dentry) {
+retry:
+ dentry = d_alloc_parallel(parent, &name, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (!d_in_lookup(dentry)) {
+ struct fuse_inode *fi;
inode = d_inode(dentry);
- if (!inode) {
- d_drop(dentry);
- } else if (get_node_id(inode) != o->nodeid ||
- ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ if (!inode ||
+ get_node_id(inode) != o->nodeid ||
+ ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
d_invalidate(dentry);
- } else if (is_bad_inode(inode)) {
- err = -EIO;
- goto out;
- } else {
- struct fuse_inode *fi;
- fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->nlookup++;
- spin_unlock(&fc->lock);
-
- fuse_change_attributes(inode, &o->attr,
- entry_attr_timeout(o),
- attr_version);
-
- /*
- * The other branch to 'found' comes via fuse_iget()
- * which bumps nlookup inside
- */
- goto found;
+ dput(dentry);
+ goto retry;
+ }
+ if (is_bad_inode(inode)) {
+ dput(dentry);
+ return -EIO;
}
- dput(dentry);
- }
-
- dentry = d_alloc(parent, &name);
- err = -ENOMEM;
- if (!dentry)
- goto out;
- inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
- &o->attr, entry_attr_timeout(o), attr_version);
- if (!inode)
- goto out;
+ fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
- alias = d_splice_alias(inode, dentry);
- err = PTR_ERR(alias);
- if (IS_ERR(alias))
- goto out;
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+ /*
+ * The other branch comes via fuse_iget()
+ * which bumps nlookup inside
+ */
+ } else {
+ inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+ &o->attr, entry_attr_timeout(o),
+ attr_version);
+ if (!inode)
+ inode = ERR_PTR(-ENOMEM);
- if (alias) {
- dput(dentry);
- dentry = alias;
+ alias = d_splice_alias(inode, dentry);
+ d_lookup_done(dentry);
+ if (alias) {
+ dput(dentry);
+ dentry = alias;
+ }
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
}
-
-found:
if (fc->readdirplus_auto)
set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
fuse_change_entry_timeout(dentry, o);
- err = 0;
-out:
dput(dentry);
- return err;
+ return 0;
}
static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
@@ -1345,7 +1344,9 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
FUSE_READDIR);
}
+ fuse_lock_inode(inode);
fuse_request_send(fc, req);
+ fuse_unlock_inode(inode);
nbytes = req->out.args[0].size;
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -1723,10 +1724,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
return fuse_update_attributes(inode, stat, NULL, NULL);
}
-static int fuse_setxattr(struct dentry *entry, const char *name,
- const void *value, size_t size, int flags)
+static int fuse_setxattr(struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
@@ -1759,10 +1760,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
return err;
}
-static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
- void *value, size_t size)
+static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode,
+ const char *name, void *value, size_t size)
{
- struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
@@ -1893,7 +1893,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
static const struct file_operations fuse_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = fuse_readdir,
+ .iterate_shared = fuse_readdir,
.open = fuse_dir_open,
.release = fuse_dir_release,
.fsync = fuse_dir_fsync,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9dde38f12c07b..f394aff59c363 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -348,7 +348,7 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
pgoff_t curr_index;
BUG_ON(req->inode != inode);
- curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
if (idx_from < curr_index + req->num_pages &&
curr_index <= idx_to) {
found = true;
@@ -417,6 +417,10 @@ static int fuse_flush(struct file *file, fl_owner_t id)
fuse_sync_writes(inode);
inode_unlock(inode);
+ err = filemap_check_errors(file->f_mapping);
+ if (err)
+ return err;
+
req = fuse_get_req_nofail_nopages(fc, file);
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
@@ -462,6 +466,16 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
goto out;
fuse_sync_writes(inode);
+
+ /*
+ * Due to implementation of fuse writeback
+ * filemap_write_and_wait_range() does not catch errors.
+ * We have to do this directly after fuse_sync_writes()
+ */
+ err = filemap_check_errors(file->f_mapping);
+ if (err)
+ goto out;
+
err = sync_inode_metadata(inode, 1);
if (err)
goto out;
@@ -562,7 +576,6 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
*/
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
- bool is_sync = is_sync_kiocb(io->iocb);
int left;
spin_lock(&io->lock);
@@ -572,11 +585,11 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
io->bytes = pos;
left = --io->reqs;
- if (!left && is_sync)
+ if (!left && io->blocking)
complete(io->done);
spin_unlock(&io->lock);
- if (!left && !is_sync) {
+ if (!left && !io->blocking) {
ssize_t res = fuse_get_res_by_io(io);
if (res >= 0) {
@@ -683,11 +696,11 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
* present there.
*/
int i;
- int start_idx = num_read >> PAGE_CACHE_SHIFT;
- size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+ int start_idx = num_read >> PAGE_SHIFT;
+ size_t off = num_read & (PAGE_SIZE - 1);
for (i = start_idx; i < req->num_pages; i++) {
- zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
+ zero_user_segment(req->pages[i], off, PAGE_SIZE);
off = 0;
}
} else {
@@ -704,7 +717,7 @@ static int fuse_do_readpage(struct file *file, struct page *page)
struct fuse_req *req;
size_t num_read;
loff_t pos = page_offset(page);
- size_t count = PAGE_CACHE_SIZE;
+ size_t count = PAGE_SIZE;
u64 attr_ver;
int err;
@@ -789,7 +802,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
else
SetPageError(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (req->ff)
fuse_file_put(req->ff, false);
@@ -800,7 +813,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
loff_t pos = page_offset(req->pages[0]);
- size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+ size_t count = req->num_pages << PAGE_SHIFT;
req->out.argpages = 1;
req->out.page_zeroing = 1;
@@ -836,7 +849,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
if (req->num_pages &&
(req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
- (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+ (req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
int nr_alloc = min_t(unsigned, data->nr_pages,
FUSE_MAX_PAGES_PER_REQ);
@@ -858,7 +871,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
return -EIO;
}
- page_cache_get(page);
+ get_page(page);
req->pages[req->num_pages] = page;
req->page_descs[req->num_pages].length = PAGE_SIZE;
req->num_pages++;
@@ -1003,17 +1016,17 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
for (i = 0; i < req->num_pages; i++) {
struct page *page = req->pages[i];
- if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
+ if (!req->out.h.error && !offset && count >= PAGE_SIZE)
SetPageUptodate(page);
- if (count > PAGE_CACHE_SIZE - offset)
- count -= PAGE_CACHE_SIZE - offset;
+ if (count > PAGE_SIZE - offset)
+ count -= PAGE_SIZE - offset;
else
count = 0;
offset = 0;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return res;
@@ -1024,7 +1037,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
struct iov_iter *ii, loff_t pos)
{
struct fuse_conn *fc = get_fuse_conn(mapping->host);
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = pos & (PAGE_SIZE - 1);
size_t count = 0;
int err;
@@ -1034,8 +1047,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
do {
size_t tmp;
struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
+ pgoff_t index = pos >> PAGE_SHIFT;
+ size_t bytes = min_t(size_t, PAGE_SIZE - offset,
iov_iter_count(ii));
bytes = min_t(size_t, bytes, fc->max_write - count);
@@ -1059,7 +1072,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
iov_iter_advance(ii, tmp);
if (!tmp) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
bytes = min(bytes, iov_iter_single_seg_count(ii));
goto again;
}
@@ -1072,7 +1085,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
count += tmp;
pos += tmp;
offset += tmp;
- if (offset == PAGE_CACHE_SIZE)
+ if (offset == PAGE_SIZE)
offset = 0;
if (!fc->big_writes)
@@ -1086,8 +1099,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
{
return min_t(unsigned,
- ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
- (pos >> PAGE_CACHE_SHIFT) + 1,
+ ((pos + len - 1) >> PAGE_SHIFT) -
+ (pos >> PAGE_SHIFT) + 1,
FUSE_MAX_PAGES_PER_REQ);
}
@@ -1186,7 +1199,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_DIRECT) {
loff_t pos = iocb->ki_pos;
- written = generic_file_direct_write(iocb, from, pos);
+ written = generic_file_direct_write(iocb, from);
if (written < 0 || !iov_iter_count(from))
goto out;
@@ -1205,8 +1218,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
invalidate_mapping_pages(file->f_mapping,
- pos >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
@@ -1295,7 +1308,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
*nbytesp = nbytes;
- return ret;
+ return ret < 0 ? ret : 0;
}
static inline int fuse_iter_npages(const struct iov_iter *ii_p)
@@ -1315,8 +1328,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
size_t count = iov_iter_count(iter);
- pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
- pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t idx_from = pos >> PAGE_SHIFT;
+ pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
ssize_t res = 0;
struct fuse_req *req;
int err = 0;
@@ -1452,7 +1465,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
list_del(&req->writepages_entry);
for (i = 0; i < req->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+ dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
}
wake_up(&fi->page_waitq);
@@ -1466,7 +1479,7 @@ __acquires(fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
- __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
+ __u64 data_size = req->num_pages * PAGE_SIZE;
if (!fc->connected)
goto out_free;
@@ -1642,7 +1655,7 @@ static int fuse_writepage_locked(struct page *page)
req->inode = inode;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+ inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
list_add(&req->writepages_entry, &fi->writepages);
@@ -1727,7 +1740,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
list_del(&new_req->writepages_entry);
list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
BUG_ON(old_req->inode != new_req->inode);
- curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT;
if (curr_index <= page->index &&
page->index < curr_index + old_req->num_pages) {
found = true;
@@ -1742,7 +1755,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
new_req->num_pages = 1;
for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
BUG_ON(tmp->inode != new_req->inode);
- curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
if (tmp->num_pages == 1 &&
curr_index == page->index) {
old_req = tmp;
@@ -1756,7 +1769,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
spin_unlock(&fc->lock);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+ dec_node_page_state(page, NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
@@ -1799,7 +1812,7 @@ static int fuse_writepages_fill(struct page *page,
if (req && req->num_pages &&
(is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
- (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+ (req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
fuse_writepages_send(data);
data->req = NULL;
@@ -1855,7 +1868,7 @@ static int fuse_writepages_fill(struct page *page,
req->page_descs[req->num_pages].length = PAGE_SIZE;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+ inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
if (is_writeback && fuse_writepage_in_flight(req, page)) {
@@ -1924,7 +1937,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
struct page *page;
loff_t fsize;
@@ -1938,15 +1951,15 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
fuse_wait_on_page_writeback(mapping->host, page->index);
- if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+ if (PageUptodate(page) || len == PAGE_SIZE)
goto success;
/*
* Check if the start this page comes after the end of file, in which
* case the readpage can be optimized away.
*/
fsize = i_size_read(mapping->host);
- if (fsize <= (pos & PAGE_CACHE_MASK)) {
- size_t off = pos & ~PAGE_CACHE_MASK;
+ if (fsize <= (pos & PAGE_MASK)) {
+ size_t off = pos & ~PAGE_MASK;
if (off)
zero_user_segment(page, 0, off);
goto success;
@@ -1960,7 +1973,7 @@ success:
cleanup:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
error:
return err;
}
@@ -1973,16 +1986,16 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
if (!PageUptodate(page)) {
/* Zero any unwritten bytes at the end of the page */
- size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+ size_t endoff = (pos + copied) & ~PAGE_MASK;
if (endoff)
- zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+ zero_user_segment(page, endoff, PAGE_SIZE);
SetPageUptodate(page);
}
fuse_write_update_size(inode, pos + copied);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -2837,7 +2850,7 @@ static inline loff_t fuse_round_up(loff_t off)
}
static ssize_t
-fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
DECLARE_COMPLETION_ONSTACK(wait);
ssize_t ret = 0;
@@ -2848,8 +2861,8 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
struct inode *inode;
loff_t i_size;
size_t count = iov_iter_count(iter);
+ loff_t offset = iocb->ki_pos;
struct fuse_io_priv *io;
- bool is_sync = is_sync_kiocb(iocb);
pos = offset;
inode = file->f_mapping->host;
@@ -2884,17 +2897,16 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
*/
io->async = async_dio;
io->iocb = iocb;
+ io->blocking = is_sync_kiocb(iocb);
/*
- * We cannot asynchronously extend the size of a file. We have no method
- * to wait on real async I/O requests, so we must submit this request
- * synchronously.
+ * We cannot asynchronously extend the size of a file.
+ * In such case the aio will behave exactly like sync io.
*/
- if (!is_sync && (offset + count > i_size) &&
- iov_iter_rw(iter) == WRITE)
- io->async = false;
+ if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
+ io->blocking = true;
- if (io->async && is_sync) {
+ if (io->async && io->blocking) {
/*
* Additional reference to keep io around after
* calling fuse_aio_complete()
@@ -2914,7 +2926,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
- if (!is_sync)
+ if (!io->blocking)
return -EIOCBQUEUED;
wait_for_completion(&wait);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index eddbe02c40289..d98d8cc84defb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -110,6 +110,9 @@ struct fuse_inode {
/** Miscellaneous bits describing inode state */
unsigned long state;
+
+ /** Lock for serializing lookup and readdir for back compatibility*/
+ struct mutex mutex;
};
/** FUSE inode state bits */
@@ -256,6 +259,7 @@ struct fuse_io_priv {
struct kiocb *iocb;
struct file *file;
struct completion *done;
+ bool blocking;
};
#define FUSE_IO_PRIV_SYNC(f) \
@@ -540,6 +544,9 @@ struct fuse_conn {
/** write-back cache policy (default is write-through) */
unsigned writeback_cache:1;
+ /** allow parallel lookups and readdir (default is serialized) */
+ unsigned parallel_dirops:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -697,7 +704,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
int generation, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version);
-int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode);
/**
@@ -956,4 +963,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
void fuse_set_initialized(struct fuse_conn *fc);
+void fuse_unlock_inode(struct inode *inode);
+void fuse_lock_inode(struct inode *inode);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4d69d5c0bedcd..4e05b51120f40 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&fi->queued_writes);
INIT_LIST_HEAD(&fi->writepages);
init_waitqueue_head(&fi->page_waitq);
+ mutex_init(&fi->mutex);
fi->forget = fuse_alloc_forget();
if (!fi->forget) {
kmem_cache_free(fuse_inode_cachep, inode);
@@ -117,6 +118,7 @@ static void fuse_destroy_inode(struct inode *inode)
struct fuse_inode *fi = get_fuse_inode(inode);
BUG_ON(!list_empty(&fi->write_files));
BUG_ON(!list_empty(&fi->queued_writes));
+ mutex_destroy(&fi->mutex);
kfree(fi->forget);
call_rcu(&inode->i_rcu, fuse_i_callback);
}
@@ -339,11 +341,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
fuse_invalidate_attr(inode);
if (offset >= 0) {
- pg_start = offset >> PAGE_CACHE_SHIFT;
+ pg_start = offset >> PAGE_SHIFT;
if (len <= 0)
pg_end = -1;
else
- pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+ pg_end = (offset + len - 1) >> PAGE_SHIFT;
invalidate_inode_pages2_range(inode->i_mapping,
pg_start, pg_end);
}
@@ -351,6 +353,18 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
return 0;
}
+void fuse_lock_inode(struct inode *inode)
+{
+ if (!get_fuse_conn(inode)->parallel_dirops)
+ mutex_lock(&get_fuse_inode(inode)->mutex);
+}
+
+void fuse_unlock_inode(struct inode *inode)
+{
+ if (!get_fuse_conn(inode)->parallel_dirops)
+ mutex_unlock(&get_fuse_inode(inode)->mutex);
+}
+
static void fuse_umount_begin(struct super_block *sb)
{
fuse_abort_conn(get_fuse_conn_super(sb));
@@ -659,13 +673,11 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid);
if (!inode) {
struct fuse_entry_out outarg;
- struct qstr name;
+ const struct qstr name = QSTR_INIT(".", 1);
if (!fc->export_support)
goto out_err;
- name.len = 1;
- name.name = ".";
err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg,
&inode);
if (err && err != -ENOENT)
@@ -761,14 +773,12 @@ static struct dentry *fuse_get_parent(struct dentry *child)
struct inode *inode;
struct dentry *parent;
struct fuse_entry_out outarg;
- struct qstr name;
+ const struct qstr name = QSTR_INIT("..", 2);
int err;
if (!fc->export_support)
return ERR_PTR(-ESTALE);
- name.len = 2;
- name.name = "..";
err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
&name, &outarg, &inode);
if (err) {
@@ -864,7 +874,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
process_init_limits(fc, arg);
if (arg->minor >= 6) {
- ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
+ ra_pages = arg->max_readahead / PAGE_SIZE;
if (arg->flags & FUSE_ASYNC_READ)
fc->async_read = 1;
if (!(arg->flags & FUSE_POSIX_LOCKS))
@@ -898,10 +908,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->async_dio = 1;
if (arg->flags & FUSE_WRITEBACK_CACHE)
fc->writeback_cache = 1;
+ if (arg->flags & FUSE_PARALLEL_DIROPS)
+ fc->parallel_dirops = 1;
if (arg->time_gran && arg->time_gran <= 1000000000)
fc->sb->s_time_gran = arg->time_gran;
} else {
- ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+ ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
@@ -922,13 +934,14 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
- arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
+ arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
- FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+ FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
- FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT;
+ FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
+ FUSE_PARALLEL_DIROPS;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -955,7 +968,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
int err;
fc->bdi.name = "fuse";
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
/* fuse does it's own writeback accounting */
fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
@@ -1053,8 +1066,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
goto err;
#endif
} else {
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
}
sb->s_magic = FUSE_SUPER_MAGIC;
sb->s_op = &fuse_super_operations;
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 791932617d1a3..363ba9e9d8d0a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -24,6 +24,7 @@
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
+#include "rgrp.h"
#include "trans.h"
#include "util.h"
@@ -38,7 +39,7 @@ static const char *gfs2_acl_name(int type)
return NULL;
}
-struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
+static struct posix_acl *__gfs2_get_acl(struct inode *inode, int type)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct posix_acl *acl;
@@ -50,29 +51,41 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
return NULL;
name = gfs2_acl_name(type);
- if (name == NULL)
- return ERR_PTR(-EINVAL);
-
len = gfs2_xattr_acl_get(ip, name, &data);
- if (len < 0)
+ if (len <= 0)
return ERR_PTR(len);
- if (len == 0)
- return NULL;
-
acl = posix_acl_from_xattr(&init_user_ns, data, len);
kfree(data);
return acl;
}
-int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ bool need_unlock = false;
+ struct posix_acl *acl;
+
+ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+ int ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+ LM_FLAG_ANY, &gh);
+ if (ret)
+ return ERR_PTR(ret);
+ need_unlock = true;
+ }
+ acl = __gfs2_get_acl(inode, type);
+ if (need_unlock)
+ gfs2_glock_dq_uninit(&gh);
+ return acl;
+}
+
+int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int error;
int len;
char *data;
const char *name = gfs2_acl_name(type);
- BUG_ON(name == NULL);
-
if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
return -E2BIG;
@@ -115,3 +128,26 @@ out:
kfree(data);
return error;
}
+
+int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ bool need_unlock = false;
+ int ret;
+
+ ret = gfs2_rsqa_alloc(ip);
+ if (ret)
+ return ret;
+
+ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (ret)
+ return ret;
+ need_unlock = true;
+ }
+ ret = __gfs2_set_acl(inode, acl, type);
+ if (need_unlock)
+ gfs2_glock_dq_uninit(&gh);
+ return ret;
+}
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 3af4f407a483e..f674fdd223374 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -15,6 +15,7 @@
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
+extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index aa016e4b8bec9..82df368869388 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -101,7 +101,7 @@ static int gfs2_writepage_common(struct page *page,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
@@ -109,9 +109,9 @@ static int gfs2_writepage_common(struct page *page,
if (current->journal_info)
goto redirty;
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
goto out;
}
return 1;
@@ -140,6 +140,32 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
}
+/* This is the same as calling block_write_full_page, but it also
+ * writes pages outside of i_size
+ */
+int gfs2_write_full_page(struct page *page, get_block_t *get_block,
+ struct writeback_control *wbc)
+{
+ struct inode * const inode = page->mapping->host;
+ loff_t i_size = i_size_read(inode);
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
+ unsigned offset;
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ offset = i_size & (PAGE_SIZE-1);
+ if (page->index == end_index && offset)
+ zero_user_segment(page, offset, PAGE_SIZE);
+
+ return __block_write_full_page(inode, page, get_block, wbc,
+ end_buffer_async_write);
+}
+
/**
* __gfs2_jdata_writepage - The core of jdata writepage
* @page: The page to write
@@ -165,7 +191,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
}
gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
}
- return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+ return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc);
}
/**
@@ -180,27 +206,20 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
int ret;
- int done_trans = 0;
- if (PageChecked(page)) {
- if (wbc->sync_mode != WB_SYNC_ALL)
- goto out_ignore;
- ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
- if (ret)
- goto out_ignore;
- done_trans = 1;
- }
- ret = gfs2_writepage_common(page, wbc);
- if (ret > 0)
- ret = __gfs2_jdata_writepage(page, wbc);
- if (done_trans)
- gfs2_trans_end(sdp);
+ if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+ goto out;
+ if (PageChecked(page) || current->journal_info)
+ goto out_ignore;
+ ret = __gfs2_jdata_writepage(page, wbc);
return ret;
out_ignore:
redirty_page_for_writepage(wbc, page);
+out:
unlock_page(page);
return 0;
}
@@ -238,7 +257,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct gfs2_sbd *sdp = GFS2_SB(inode);
- unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+ unsigned nrblocks = nr_pages * (PAGE_SIZE/inode->i_sb->s_blocksize);
int i;
int ret;
@@ -366,8 +385,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
cycled = 0;
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
@@ -458,7 +477,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
* so we need to supply one here. It doesn't happen often.
*/
if (unlikely(page->index)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
SetPageUptodate(page);
return 0;
}
@@ -471,7 +490,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+ memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap_atomic(kaddr);
flush_dcache_page(page);
brelse(dibh);
@@ -560,8 +579,8 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
unsigned size)
{
struct address_space *mapping = ip->i_inode.i_mapping;
- unsigned long index = *pos / PAGE_CACHE_SIZE;
- unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+ unsigned long index = *pos / PAGE_SIZE;
+ unsigned offset = *pos & (PAGE_SIZE - 1);
unsigned copied = 0;
unsigned amt;
struct page *page;
@@ -569,15 +588,15 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
do {
amt = size - copied;
- if (offset + size > PAGE_CACHE_SIZE)
- amt = PAGE_CACHE_SIZE - offset;
+ if (offset + size > PAGE_SIZE)
+ amt = PAGE_SIZE - offset;
page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
p = kmap_atomic(page);
memcpy(buf + copied, p + offset, amt);
kunmap_atomic(p);
- page_cache_release(page);
+ put_page(page);
copied += amt;
index++;
offset = 0;
@@ -651,8 +670,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
unsigned requested = 0;
int alloc_required;
int error = 0;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned from = pos & (PAGE_SIZE - 1);
struct page *page;
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -697,7 +716,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
rblocks += gfs2_rg_blocks(ip, requested);
error = gfs2_trans_begin(sdp, rblocks,
- PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+ PAGE_SIZE/sdp->sd_sb.sb_bsize);
if (error)
goto out_trans_fail;
@@ -727,7 +746,7 @@ out:
return 0;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
gfs2_trans_end(sdp);
if (pos + len > ip->i_inode.i_size)
@@ -827,7 +846,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
if (!PageUptodate(page))
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (copied) {
if (inode->i_size < to)
@@ -877,7 +896,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct buffer_head *dibh;
- unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int from = pos & (PAGE_SIZE - 1);
unsigned int to = from + len;
int ret;
struct gfs2_trans *tr = current->journal_info;
@@ -888,7 +907,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
ret = gfs2_meta_inode_buffer(ip, &dibh);
if (unlikely(ret)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto failed;
}
@@ -977,7 +996,7 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
if (!list_empty(&bd->bd_list) && !buffer_pinned(bh))
list_del_init(&bd->bd_list);
else
- gfs2_remove_from_journal(bh, current->journal_info, 0);
+ gfs2_remove_from_journal(bh, REMOVE_JDATA);
}
bh->b_bdev = NULL;
clear_buffer_mapped(bh);
@@ -992,7 +1011,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset,
{
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_CACHE_SIZE);
+ int partial_page = (offset || length < PAGE_SIZE);
struct buffer_head *bh, *head;
unsigned long pos = 0;
@@ -1042,13 +1061,13 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
-static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct address_space *mapping = inode->i_mapping;
struct gfs2_inode *ip = GFS2_I(inode);
+ loff_t offset = iocb->ki_pos;
struct gfs2_holder gh;
int rv;
@@ -1063,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
rv = gfs2_glock_nq(&gh);
if (rv)
- return rv;
+ goto out_uninit;
rv = gfs2_ok_for_dio(ip, offset);
if (rv != 1)
goto out; /* dio not valid, fall back to buffered i/o */
@@ -1082,7 +1101,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* the first place, mapping->nr_pages will always be zero.
*/
if (mapping->nrpages) {
- loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1);
+ loff_t lstart = offset & ~(PAGE_SIZE - 1);
loff_t len = iov_iter_count(iter);
loff_t end = PAGE_ALIGN(offset + len) - 1;
@@ -1099,9 +1118,10 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- offset, gfs2_get_block_direct, NULL, NULL, 0);
+ gfs2_get_block_direct, NULL, NULL, 0);
out:
gfs2_glock_dq(&gh);
+out_uninit:
gfs2_holder_uninit(&gh);
return rv;
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0860f0b5b3f19..6e2bec1cd2894 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,7 +75,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
dsize = dibh->b_size - sizeof(struct gfs2_dinode);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+ memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap(page);
SetPageUptodate(page);
@@ -98,7 +98,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
if (release) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return 0;
@@ -285,7 +285,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
if (trylock_buffer(rabh)) {
if (!buffer_uptodate(rabh)) {
rabh->b_end_io = end_buffer_read_sync;
- submit_bh(READA | REQ_META, rabh);
+ submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META,
+ rabh);
continue;
}
unlock_buffer(rabh);
@@ -932,8 +933,8 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
{
struct inode *inode = mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- unsigned long index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned long index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize, iblock, length, pos;
struct buffer_head *bh;
struct page *page;
@@ -945,7 +946,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -974,7 +975,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
if (!buffer_uptodate(bh)) {
err = -EIO;
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
@@ -989,7 +990,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
mark_buffer_dirty(bh);
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 30822b148f3e6..5173b98ca0368 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -117,7 +117,7 @@ static int gfs2_dentry_delete(const struct dentry *dentry)
return 0;
ginode = GFS2_I(d_inode(dentry));
- if (!ginode->i_iopen_gh.gh_gl)
+ if (!gfs2_holder_initialized(&ginode->i_iopen_gh))
return 0;
if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 4a01f30e99954..fcb59b23f1e38 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
u64 *leaf_out)
{
__be64 *hash;
+ int error;
hash = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hash))
- return PTR_ERR(hash);
- *leaf_out = be64_to_cpu(*(hash + index));
- return 0;
+ error = PTR_ERR_OR_ZERO(hash);
+
+ if (!error)
+ *leaf_out = be64_to_cpu(*(hash + index));
+
+ return error;
}
static int get_first_leaf(struct gfs2_inode *dip, u32 index,
@@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
int error;
error = get_leaf_nr(dip, index, &leaf_no);
- if (!IS_ERR_VALUE(error))
+ if (!error)
error = get_leaf(dip, leaf_no, bh_out);
return error;
@@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
index = name->hash >> (32 - dip->i_depth);
error = get_leaf_nr(dip, index, &leaf_no);
- if (IS_ERR_VALUE(error))
+ if (error)
return error;
/* Get the old leaf block */
@@ -1510,7 +1513,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
continue;
}
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READA | REQ_META, bh);
+ submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh);
continue;
}
brelse(bh);
@@ -1660,7 +1663,8 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
brelse(bh);
if (fail_on_exist)
return ERR_PTR(-EEXIST);
- inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino);
+ inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino,
+ GFS2_BLKST_FREE /* ignore */);
if (!IS_ERR(inode))
GFS2_I(inode)->i_rahead = rahead;
return inode;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d5bda85134574..a332f3cd925ef 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -137,21 +137,10 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
struct gfs2_sbd *sdp = sb->s_fs_info;
struct inode *inode;
- inode = gfs2_ilookup(sb, inum->no_addr);
- if (inode) {
- if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
- goto out_inode;
- }
-
inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
GFS2_BLKST_DINODE);
if (IS_ERR(inode))
return ERR_CAST(inode);
-
-out_inode:
return d_obtain_alias(inode);
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c9384f932975e..320e65e61938a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -160,7 +160,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
error = gfs2_glock_nq(&gh);
if (error)
- return error;
+ goto out_uninit;
fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
@@ -169,6 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
error = -EFAULT;
gfs2_glock_dq(&gh);
+out_uninit:
gfs2_holder_uninit(&gh);
return error;
}
@@ -354,8 +355,8 @@ static int gfs2_allocate_page_backing(struct page *page)
{
struct inode *inode = page->mapping->host;
struct buffer_head bh;
- unsigned long size = PAGE_CACHE_SIZE;
- u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ unsigned long size = PAGE_SIZE;
+ u64 lblock = page->index << (PAGE_SHIFT - inode->i_blkbits);
do {
bh.b_state = 0;
@@ -386,7 +387,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned long last_index;
- u64 pos = page->index << PAGE_CACHE_SHIFT;
+ u64 pos = page->index << PAGE_SHIFT;
unsigned int data_blocks, ind_blocks, rblocks;
struct gfs2_holder gh;
loff_t size;
@@ -401,7 +402,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
goto out;
- gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
+ gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
@@ -411,7 +412,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
- if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
+ if (!gfs2_write_alloc_required(ip, pos, PAGE_SIZE)) {
lock_page(page);
if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
ret = -EAGAIN;
@@ -424,7 +425,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
goto out_unlock;
- gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+ gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
ap.target = data_blocks + ind_blocks;
ret = gfs2_quota_lock_check(ip, &ap);
if (ret)
@@ -447,7 +448,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
lock_page(page);
ret = -EINVAL;
size = i_size_read(inode);
- last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (size - 1) >> PAGE_SHIFT;
/* Check page index against inode size */
if (size == 0 || (page->index > last_index))
goto out_trans_end;
@@ -873,7 +874,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
rblocks += data_blocks ? data_blocks : 1;
error = gfs2_trans_begin(sdp, rblocks,
- PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+ PAGE_SIZE/sdp->sd_sb.sb_bsize);
if (error)
goto out_trans_fail;
@@ -895,7 +896,10 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
mark_inode_dirty(inode);
}
- return generic_write_sync(file, pos, count);
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
+ return vfs_fsync_range(file, pos, pos + count - 1,
+ (file->f_flags & __O_SYNC) ? 0 : 1);
+ return 0;
out_trans_fail:
gfs2_inplace_release(ip);
@@ -950,6 +954,30 @@ out_uninit:
return ret;
}
+static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct inode *inode = in->f_mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ inode_lock(inode);
+
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ if (ret) {
+ inode_unlock(inode);
+ return ret;
+ }
+
+ gfs2_glock_dq_uninit(&gh);
+ inode_unlock(inode);
+
+ return generic_file_splice_read(in, ppos, pipe, len, flags);
+}
+
+
static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
@@ -1070,7 +1098,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
mutex_lock(&fp->f_fl_mutex);
locks_lock_file_wait(file, fl);
- if (fl_gh->gh_gl) {
+ if (gfs2_holder_initialized(fl_gh)) {
gfs2_glock_dq(fl_gh);
gfs2_holder_uninit(fl_gh);
}
@@ -1112,14 +1140,14 @@ const struct file_operations gfs2_file_fops = {
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
- .splice_read = generic_file_splice_read,
+ .splice_read = gfs2_file_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
};
const struct file_operations gfs2_dir_fops = {
- .iterate = gfs2_readdir,
+ .iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
@@ -1140,14 +1168,14 @@ const struct file_operations gfs2_file_fops_nolock = {
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = gfs2_file_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
};
const struct file_operations gfs2_dir_fops_nolock = {
- .iterate = gfs2_readdir,
+ .iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6539131c52a24..3a90b2b5b9bb3 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -218,7 +218,7 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
*
*/
-static inline void do_error(struct gfs2_glock *gl, const int ret)
+static void do_error(struct gfs2_glock *gl, const int ret)
{
struct gfs2_holder *gh, *tmp;
@@ -475,7 +475,14 @@ __acquires(&gl->gl_lockref.lock)
if (sdp->sd_lockstruct.ls_ops->lm_lock) {
/* lock_dlm */
ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
- if (ret) {
+ if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED &&
+ target == LM_ST_UNLOCKED &&
+ test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) {
+ finish_xmote(gl, target);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+ }
+ else if (ret) {
pr_err("lm_lock ret %d\n", ret);
GLOCK_BUG_ON(gl, 1);
}
@@ -568,7 +575,6 @@ static void delete_work_func(struct work_struct *work)
{
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct gfs2_inode *ip;
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
@@ -578,13 +584,7 @@ static void delete_work_func(struct work_struct *work)
if (test_bit(GLF_INODE_CREATING, &gl->gl_flags))
goto out;
- ip = gl->gl_object;
- /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
-
- if (ip)
- inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
- else
- inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
+ inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
if (inode && !IS_ERR(inode)) {
d_prune_aliases(inode);
iput(inode);
@@ -801,7 +801,7 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
{
put_pid(gh->gh_owner_pid);
gfs2_glock_put(gh->gh_gl);
- gh->gh_gl = NULL;
+ gfs2_holder_mark_uninitialized(gh);
gh->gh_ip = 0;
}
@@ -1913,7 +1913,7 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
}
return ret;
}
@@ -1941,7 +1941,7 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
}
return ret;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 46ab67fc16daa..ab1ef322f7a53 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -247,4 +247,14 @@ extern void gfs2_unregister_debugfs(void);
extern const struct lm_lockops gfs2_dlm_ops;
+static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh)
+{
+ gh->gh_gl = NULL;
+}
+
+static inline bool gfs2_holder_initialized(struct gfs2_holder *gh)
+{
+ return gh->gh_gl;
+}
+
#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 437fd73e381e2..5db59d4448380 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -286,17 +286,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
static int inode_go_demote_ok(const struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct gfs2_holder *gh;
if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
return 0;
- if (!list_empty(&gl->gl_holders)) {
- gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
- if (gh->gh_list.next != &gl->gl_holders)
- return 0;
- }
-
return 1;
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index bb30f9a72c659..e4da0ecd32852 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -37,9 +37,35 @@
#include "super.h"
#include "glops.h"
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
+static int iget_test(struct inode *inode, void *opaque)
{
- return ilookup(sb, (unsigned long)no_addr);
+ u64 no_addr = *(u64 *)opaque;
+
+ return GFS2_I(inode)->i_no_addr == no_addr;
+}
+
+static int iget_set(struct inode *inode, void *opaque)
+{
+ u64 no_addr = *(u64 *)opaque;
+
+ GFS2_I(inode)->i_no_addr = no_addr;
+ inode->i_ino = no_addr;
+ return 0;
+}
+
+static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
+{
+ struct inode *inode;
+
+repeat:
+ inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr);
+ if (!inode)
+ return inode;
+ if (is_bad_inode(inode)) {
+ iput(inode);
+ goto repeat;
+ }
+ return inode;
}
/**
@@ -78,27 +104,38 @@ static void gfs2_set_iop(struct inode *inode)
/**
* gfs2_inode_lookup - Lookup an inode
* @sb: The super block
- * @no_addr: The inode number
* @type: The type of the inode
+ * @no_addr: The inode number
+ * @no_formal_ino: The inode generation number
+ * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED;
+ * GFS2_BLKST_FREE to indicate not to verify)
+ *
+ * If @type is DT_UNKNOWN, the inode type is fetched from disk.
+ *
+ * If @blktype is anything other than GFS2_BLKST_FREE (which is used as a
+ * placeholder because it doesn't otherwise make sense), the on-disk block type
+ * is verified to be @blktype.
*
* Returns: A VFS inode, or an error
*/
struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
- u64 no_addr, u64 no_formal_ino)
+ u64 no_addr, u64 no_formal_ino,
+ unsigned int blktype)
{
struct inode *inode;
struct gfs2_inode *ip;
struct gfs2_glock *io_gl = NULL;
+ struct gfs2_holder i_gh;
int error;
- inode = iget_locked(sb, (unsigned long)no_addr);
- ip = GFS2_I(inode);
- ip->i_no_addr = no_addr;
-
+ gfs2_holder_mark_uninitialized(&i_gh);
+ inode = gfs2_iget(sb, no_addr);
if (!inode)
return ERR_PTR(-ENOMEM);
+ ip = GFS2_I(inode);
+
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
ip->i_no_formal_ino = no_formal_ino;
@@ -112,10 +149,29 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (unlikely(error))
goto fail_put;
+ if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) {
+ /*
+ * The GL_SKIP flag indicates to skip reading the inode
+ * block. We read the inode with gfs2_inode_refresh
+ * after possibly checking the block type.
+ */
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE,
+ GL_SKIP, &i_gh);
+ if (error)
+ goto fail_put;
+
+ if (blktype != GFS2_BLKST_FREE) {
+ error = gfs2_check_blk_type(sdp, no_addr,
+ blktype);
+ if (error)
+ goto fail_put;
+ }
+ }
+
set_bit(GIF_INVALID, &ip->i_flags);
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
- goto fail_iopen;
+ goto fail_put;
ip->i_iopen_gh.gh_gl->gl_object = ip;
gfs2_glock_put(io_gl);
@@ -134,6 +190,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
unlock_new_inode(inode);
}
+ if (gfs2_holder_initialized(&i_gh))
+ gfs2_glock_dq_uninit(&i_gh);
return inode;
fail_refresh:
@@ -141,10 +199,11 @@ fail_refresh:
ip->i_iopen_gh.gh_gl->gl_object = NULL;
gfs2_glock_dq_wait(&ip->i_iopen_gh);
gfs2_holder_uninit(&ip->i_iopen_gh);
-fail_iopen:
+fail_put:
if (io_gl)
gfs2_glock_put(io_gl);
-fail_put:
+ if (gfs2_holder_initialized(&i_gh))
+ gfs2_glock_dq_uninit(&i_gh);
ip->i_gl->gl_object = NULL;
fail:
iget_failed(inode);
@@ -155,23 +214,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
u64 *no_formal_ino, unsigned int blktype)
{
struct super_block *sb = sdp->sd_vfs;
- struct gfs2_holder i_gh;
- struct inode *inode = NULL;
+ struct inode *inode;
int error;
- /* Must not read in block until block type is verified */
- error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
- LM_ST_EXCLUSIVE, GL_SKIP, &i_gh);
- if (error)
- return ERR_PTR(error);
-
- error = gfs2_check_blk_type(sdp, no_addr, blktype);
- if (error)
- goto fail;
-
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, blktype);
if (IS_ERR(inode))
- goto fail;
+ return inode;
/* Two extra checks for NFS only */
if (no_formal_ino) {
@@ -182,16 +230,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
error = -EIO;
if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
goto fail_iput;
-
- error = 0;
}
+ return inode;
-fail:
- gfs2_glock_dq_uninit(&i_gh);
- return error ? ERR_PTR(error) : inode;
fail_iput:
iput(inode);
- goto fail;
+ return ERR_PTR(error);
}
@@ -236,8 +280,8 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
struct gfs2_holder d_gh;
int error = 0;
struct inode *inode = NULL;
- int unlock = 0;
+ gfs2_holder_mark_uninitialized(&d_gh);
if (!name->len || name->len > GFS2_FNAMESIZE)
return ERR_PTR(-ENAMETOOLONG);
@@ -252,7 +296,6 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
if (error)
return ERR_PTR(error);
- unlock = 1;
}
if (!is_root) {
@@ -265,7 +308,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
if (IS_ERR(inode))
error = PTR_ERR(inode);
out:
- if (unlock)
+ if (gfs2_holder_initialized(&d_gh))
gfs2_glock_dq_uninit(&d_gh);
if (error == -ENOENT)
return NULL;
@@ -692,12 +735,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
considered free. Any failures need to undo
the gfs2 structures. */
if (default_acl) {
- error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
}
if (acl) {
if (!error)
- error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
}
@@ -1189,7 +1232,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
struct dentry *d;
bool excl = !!(flags & O_EXCL);
- if (!d_unhashed(dentry))
+ if (!d_in_lookup(dentry))
goto skip_lookup;
d = __gfs2_lookup(dir, dentry, file, opened);
@@ -1309,7 +1352,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
struct gfs2_inode *ip = GFS2_I(d_inode(odentry));
struct gfs2_inode *nip = NULL;
struct gfs2_sbd *sdp = GFS2_SB(odir);
- struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+ struct gfs2_holder ghs[5], r_gh;
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
@@ -1317,6 +1360,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
unsigned int x;
int error;
+ gfs2_holder_mark_uninitialized(&r_gh);
if (d_really_is_positive(ndentry)) {
nip = GFS2_I(d_inode(ndentry));
if (ip == nip)
@@ -1506,7 +1550,7 @@ out_gunlock:
gfs2_holder_uninit(ghs + x);
}
out_gunlock_r:
- if (r_gh.gh_gl)
+ if (gfs2_holder_initialized(&r_gh))
gfs2_glock_dq_uninit(&r_gh);
out:
return error;
@@ -1532,13 +1576,14 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
struct gfs2_inode *oip = GFS2_I(odentry->d_inode);
struct gfs2_inode *nip = GFS2_I(ndentry->d_inode);
struct gfs2_sbd *sdp = GFS2_SB(odir);
- struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+ struct gfs2_holder ghs[5], r_gh;
unsigned int num_gh;
unsigned int x;
umode_t old_mode = oip->i_inode.i_mode;
umode_t new_mode = nip->i_inode.i_mode;
int error;
+ gfs2_holder_mark_uninitialized(&r_gh);
error = gfs2_rindex_update(sdp);
if (error)
return error;
@@ -1646,7 +1691,7 @@ out_gunlock:
gfs2_holder_uninit(ghs + x);
}
out_gunlock_r:
- if (r_gh.gh_gl)
+ if (gfs2_holder_initialized(&r_gh))
gfs2_glock_dq_uninit(&r_gh);
out:
return error;
@@ -1743,9 +1788,8 @@ int gfs2_permission(struct inode *inode, int mask)
struct gfs2_inode *ip;
struct gfs2_holder i_gh;
int error;
- int unlock = 0;
-
+ gfs2_holder_mark_uninitialized(&i_gh);
ip = GFS2_I(inode);
if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
if (mask & MAY_NOT_BLOCK)
@@ -1753,14 +1797,13 @@ int gfs2_permission(struct inode *inode, int mask)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
- unlock = 1;
}
if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
- error = -EACCES;
+ error = -EPERM;
else
error = generic_permission(inode, mask);
- if (unlock)
+ if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
return error;
@@ -1932,83 +1975,21 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int error;
- int unlock = 0;
+ gfs2_holder_mark_uninitialized(&gh);
if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
if (error)
return error;
- unlock = 1;
}
generic_fillattr(inode, stat);
- if (unlock)
+ if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
return 0;
}
-static int gfs2_setxattr(struct dentry *dentry, const char *name,
- const void *data, size_t size, int flags)
-{
- struct inode *inode = d_inode(dentry);
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- ret = gfs2_glock_nq(&gh);
- if (ret == 0) {
- ret = gfs2_rsqa_alloc(ip);
- if (ret == 0)
- ret = generic_setxattr(dentry, name, data, size, flags);
- gfs2_glock_dq(&gh);
- }
- gfs2_holder_uninit(&gh);
- return ret;
-}
-
-static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
- void *data, size_t size)
-{
- struct inode *inode = d_inode(dentry);
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- /* For selinux during lookup */
- if (gfs2_glock_is_locked_by_me(ip->i_gl))
- return generic_getxattr(dentry, name, data, size);
-
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- ret = gfs2_glock_nq(&gh);
- if (ret == 0) {
- ret = generic_getxattr(dentry, name, data, size);
- gfs2_glock_dq(&gh);
- }
- gfs2_holder_uninit(&gh);
- return ret;
-}
-
-static int gfs2_removexattr(struct dentry *dentry, const char *name)
-{
- struct inode *inode = d_inode(dentry);
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- ret = gfs2_glock_nq(&gh);
- if (ret == 0) {
- ret = gfs2_rsqa_alloc(ip);
- if (ret == 0)
- ret = generic_removexattr(dentry, name);
- gfs2_glock_dq(&gh);
- }
- gfs2_holder_uninit(&gh);
- return ret;
-}
-
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -2055,10 +2036,10 @@ const struct inode_operations gfs2_file_iops = {
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
- .setxattr = gfs2_setxattr,
- .getxattr = gfs2_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = gfs2_listxattr,
- .removexattr = gfs2_removexattr,
+ .removexattr = generic_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
@@ -2077,10 +2058,10 @@ const struct inode_operations gfs2_dir_iops = {
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
- .setxattr = gfs2_setxattr,
- .getxattr = gfs2_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = gfs2_listxattr,
- .removexattr = gfs2_removexattr,
+ .removexattr = generic_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
@@ -2093,10 +2074,10 @@ const struct inode_operations gfs2_symlink_iops = {
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
- .setxattr = gfs2_setxattr,
- .getxattr = gfs2_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = gfs2_listxattr,
- .removexattr = gfs2_removexattr,
+ .removexattr = generic_removexattr,
.fiemap = gfs2_fiemap,
};
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index e1af0d4aa308e..7710dfd3af350 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -94,11 +94,11 @@ err:
}
extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
- u64 no_addr, u64 no_formal_ino);
+ u64 no_addr, u64 no_formal_ino,
+ unsigned int blktype);
extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
u64 *no_formal_ino,
unsigned int blktype);
-extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 0ff028c15199a..e58ccef09c917 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
struct gfs2_log_header *lh;
unsigned int tail;
u32 hash;
- int rw = WRITE_FLUSH_FUA | REQ_META;
+ int op_flags = WRITE_FLUSH_FUA | REQ_META;
struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
lh = page_address(page);
@@ -682,12 +682,12 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
gfs2_ordered_wait(sdp);
log_flush_wait(sdp);
- rw = WRITE_SYNC | REQ_META | REQ_PRIO;
+ op_flags = WRITE_SYNC | REQ_META | REQ_PRIO;
}
sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
gfs2_log_write_page(sdp, page);
- gfs2_log_flush_bio(sdp, rw);
+ gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags);
log_flush_wait(sdp);
if (sdp->sd_log_tail != tail)
@@ -738,7 +738,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
gfs2_ordered_write(sdp);
lops_before_commit(sdp, tr);
- gfs2_log_flush_bio(sdp, WRITE);
+ gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index d5369a109781d..49d5a1b61b069 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -230,17 +230,19 @@ static void gfs2_end_log_write(struct bio *bio)
/**
* gfs2_log_flush_bio - Submit any pending log bio
* @sdp: The superblock
- * @rw: The rw flags
+ * @op: REQ_OP
+ * @op_flags: rq_flag_bits
*
* Submit any pending part-built or full bio to the block device. If
* there is no pending bio, then this is a no-op.
*/
-void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw)
+void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags)
{
if (sdp->sd_log_bio) {
atomic_inc(&sdp->sd_log_in_flight);
- submit_bio(rw, sdp->sd_log_bio);
+ bio_set_op_attrs(sdp->sd_log_bio, op, op_flags);
+ submit_bio(sdp->sd_log_bio);
sdp->sd_log_bio = NULL;
}
}
@@ -299,7 +301,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
nblk >>= sdp->sd_fsb2bb_shift;
if (blkno == nblk)
return bio;
- gfs2_log_flush_bio(sdp, WRITE);
+ gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
}
return gfs2_log_alloc_bio(sdp, blkno);
@@ -328,7 +330,7 @@ static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
bio = gfs2_log_get_bio(sdp, blkno);
ret = bio_add_page(bio, page, size, offset);
if (ret == 0) {
- gfs2_log_flush_bio(sdp, WRITE);
+ gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
bio = gfs2_log_alloc_bio(sdp, blkno);
ret = bio_add_page(bio, page, size, offset);
WARN_ON(ret == 0);
@@ -535,9 +537,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
return 0;
- gfs2_replay_incr_blk(sdp, &start);
+ gfs2_replay_incr_blk(jd, &start);
- for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+ for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
blkno = be64_to_cpu(*ptr++);
jd->jd_found_blocks++;
@@ -693,7 +695,7 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
offset = sizeof(struct gfs2_log_descriptor);
- for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+ for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
error = gfs2_replay_read_block(jd, start, &bh);
if (error)
return error;
@@ -762,7 +764,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
__be64 *ptr, int pass)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_glock *gl = ip->i_gl;
unsigned int blks = be32_to_cpu(ld->ld_data1);
struct buffer_head *bh_log, *bh_ip;
@@ -773,8 +774,8 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
return 0;
- gfs2_replay_incr_blk(sdp, &start);
- for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+ gfs2_replay_incr_blk(jd, &start);
+ for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
blkno = be64_to_cpu(*ptr++);
esc = be64_to_cpu(*ptr++);
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index a65a7ba32ffdf..e529f536c1179 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -27,7 +27,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
-extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw);
+extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index f99f8e94de3f3..74fd0139e6c2e 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -45,6 +45,7 @@ static void gfs2_init_inode_once(void *foo)
memset(&ip->i_res, 0, sizeof(ip->i_res));
RB_CLEAR_NODE(&ip->i_res.rs_node);
ip->i_hash_cache = NULL;
+ gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
}
static void gfs2_init_glock_once(void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index e137d96f1b17b..950b8be68e415 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,8 +37,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
{
struct buffer_head *bh, *head;
int nr_underway = 0;
- int write_op = REQ_META | REQ_PRIO |
- (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ int write_flags = REQ_META | REQ_PRIO |
+ (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
BUG_ON(!PageLocked(page));
BUG_ON(!page_has_buffers(page));
@@ -79,7 +79,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(write_op, bh);
+ submit_bh(REQ_OP_WRITE, write_flags, bh);
nr_underway++;
}
bh = next;
@@ -124,7 +124,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
if (mapping == NULL)
mapping = &sdp->sd_aspace;
- shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+ shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
index = blkno >> shift; /* convert block to page */
bufnum = blkno - (index << shift); /* block buf index within page */
@@ -154,7 +154,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
map_bh(bh, sdp->sd_vfs, blkno);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return bh;
}
@@ -213,7 +213,8 @@ static void gfs2_meta_read_endio(struct bio *bio)
* Submit several consecutive buffer head I/O requests as a single bio I/O
* request. (See submit_bh_wbc.)
*/
-static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
+ int num)
{
struct buffer_head *bh = bhs[0];
struct bio *bio;
@@ -230,7 +231,8 @@ static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}
bio->bi_end_io = gfs2_meta_read_endio;
- submit_bio(rw, bio);
+ bio_set_op_attrs(bio, op, op_flags);
+ submit_bio(bio);
}
/**
@@ -280,7 +282,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
}
}
- gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
+ gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
@@ -325,18 +327,19 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
return 0;
}
-void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
+void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
{
struct address_space *mapping = bh->b_page->mapping;
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
struct gfs2_bufdata *bd = bh->b_private;
+ struct gfs2_trans *tr = current->journal_info;
int was_pinned = 0;
if (test_clear_buffer_pinned(bh)) {
trace_gfs2_pin(bd, 0);
atomic_dec(&sdp->sd_log_pinned);
list_del_init(&bd->bd_list);
- if (meta)
+ if (meta == REMOVE_META)
tr->tr_num_buf_rm++;
else
tr->tr_num_databuf_rm++;
@@ -376,7 +379,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
if (bh) {
lock_buffer(bh);
gfs2_log_lock(sdp);
- gfs2_remove_from_journal(bh, current->journal_info, 1);
+ gfs2_remove_from_journal(bh, REMOVE_META);
gfs2_log_unlock(sdp);
unlock_buffer(bh);
brelse(bh);
@@ -447,7 +450,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
+ ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh);
dblock++;
extlen--;
@@ -456,7 +459,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(READA | REQ_META, 1, &bh);
+ ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh);
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index c5086c8af5ed4..ffdf6aa3509d5 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -57,8 +57,12 @@ extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
int create);
-extern void gfs2_remove_from_journal(struct buffer_head *bh,
- struct gfs2_trans *tr, int meta);
+enum {
+ REMOVE_JDATA = 0,
+ REMOVE_META = 1,
+};
+
+extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head **bhp);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 49b0bff18fe3a..ef1e1822977f1 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -246,7 +246,8 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- submit_bio(READ_SYNC | REQ_META, bio);
+ bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META);
+ submit_bio(bio);
wait_on_page_locked(page);
bio_put(bio);
if (!PageUptodate(page)) {
@@ -454,7 +455,8 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
struct dentry *dentry;
struct inode *inode;
- inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
+ inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0,
+ GFS2_BLKST_FREE /* ignore */);
if (IS_ERR(inode)) {
fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
return PTR_ERR(inode);
@@ -824,7 +826,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
* i_mutex on quota files is special. Since this inode is hidden system
* file, we are safe to define locking ourselves.
*/
- lockdep_set_class(&sdp->sd_quota_inode->i_mutex,
+ lockdep_set_class(&sdp->sd_quota_inode->i_rwsem,
&gfs2_quota_imutex_key);
error = gfs2_rindex_update(sdp);
@@ -1360,7 +1362,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
return ERR_PTR(error);
}
s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
- d_inode(path.dentry)->i_sb->s_bdev);
+ path.dentry->d_sb->s_bdev);
path_put(&path);
if (IS_ERR(s)) {
pr_warn("gfs2 mount does not exist\n");
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a398913442591..77930ca25303d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -701,7 +701,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
unsigned to_write = bytes, pg_off = off;
int done = 0;
- blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
+ blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift);
boff = off % bsize;
page = find_or_create_page(mapping, index, GFP_NOFS);
@@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
if (PageUptodate(page))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- ll_rw_block(READ | REQ_META, 1, &bh);
+ ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
goto unlock_out;
@@ -753,13 +753,13 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
flush_dcache_page(page);
kunmap_atomic(kaddr);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return 0;
unlock_out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return -EIO;
}
@@ -773,13 +773,13 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
nbytes = sizeof(struct gfs2_quota);
- pg_beg = loc >> PAGE_CACHE_SHIFT;
- pg_off = loc % PAGE_CACHE_SIZE;
+ pg_beg = loc >> PAGE_SHIFT;
+ pg_off = loc % PAGE_SIZE;
/* If the quota straddles a page boundary, split the write in two */
- if ((pg_off + nbytes) > PAGE_CACHE_SIZE) {
+ if ((pg_off + nbytes) > PAGE_SIZE) {
pg_oflow = 1;
- overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE;
+ overflow = (pg_off + nbytes) - PAGE_SIZE;
}
ptr = qp;
@@ -883,7 +883,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
- ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
+ ghs = kmalloc(num_qd * sizeof(struct gfs2_holder), GFP_NOFS);
if (!ghs)
return -ENOMEM;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 1b645773c98e2..113b6095a58dd 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -338,7 +338,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
struct gfs2_log_header_host lh;
error = get_log_header(jd, start, &lh);
if (!error) {
- gfs2_replay_incr_blk(sdp, &start);
+ gfs2_replay_incr_blk(jd, &start);
brelse(bh);
continue;
}
@@ -360,7 +360,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
}
while (length--)
- gfs2_replay_incr_blk(sdp, &start);
+ gfs2_replay_incr_blk(jd, &start);
brelse(bh);
}
@@ -390,7 +390,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
lblock = head->lh_blkno;
- gfs2_replay_incr_blk(sdp, &lblock);
+ gfs2_replay_incr_blk(jd, &lblock);
bh_map.b_size = 1 << ip->i_inode.i_blkbits;
error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0);
if (error)
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 6142836cce961..11fdfab4bf99d 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -14,9 +14,9 @@
extern struct workqueue_struct *gfs_recovery_wq;
-static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
+static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, unsigned int *blk)
{
- if (++*blk == sdp->sd_jdesc->jd_blocks)
+ if (++*blk == jd->jd_blocks)
*blk = 0;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 07c0265aa1953..86ccc01593937 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -73,8 +73,7 @@ static const char valid_change[16] = {
};
static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
- const struct gfs2_inode *ip, bool nowrap,
- const struct gfs2_alloc_parms *ap);
+ const struct gfs2_inode *ip, bool nowrap);
/**
@@ -659,6 +658,7 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
if (rgd) {
spin_lock(&rgd->rd_rsspin);
__rs_deltree(rs);
+ BUG_ON(rs->rs_free);
spin_unlock(&rgd->rd_rsspin);
}
}
@@ -672,10 +672,8 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
down_write(&ip->i_rw_mutex);
- if ((wcount == NULL) || (atomic_read(wcount) <= 1)) {
+ if ((wcount == NULL) || (atomic_read(wcount) <= 1))
gfs2_rs_deltree(&ip->i_res);
- BUG_ON(ip->i_res.rs_free);
- }
up_write(&ip->i_rw_mutex);
gfs2_qa_delete(ip, wcount);
}
@@ -723,6 +721,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
gfs2_free_clones(rgd);
kfree(rgd->rd_bits);
+ rgd->rd_bits = NULL;
return_all_reservations(rgd);
kmem_cache_free(gfs2_rgrpd_cachep, rgd);
}
@@ -917,10 +916,6 @@ static int read_rindex_entry(struct gfs2_inode *ip)
if (error)
goto fail;
- rgd->rd_gl->gl_object = rgd;
- rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK;
- rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr +
- rgd->rd_length) * bsize) - 1;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -928,14 +923,20 @@ static int read_rindex_entry(struct gfs2_inode *ip)
spin_lock(&sdp->sd_rindex_spin);
error = rgd_insert(rgd);
spin_unlock(&sdp->sd_rindex_spin);
- if (!error)
+ if (!error) {
+ rgd->rd_gl->gl_object = rgd;
+ rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
+ rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
+ rgd->rd_length) * bsize) - 1;
return 0;
+ }
error = 0; /* someone else read in the rgrp; free it and ignore it */
gfs2_glock_put(rgd->rd_gl);
fail:
kfree(rgd->rd_bits);
+ rgd->rd_bits = NULL;
kmem_cache_free(gfs2_rgrpd_cachep, rgd);
return error;
}
@@ -1512,7 +1513,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
return;
- ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
+ ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
if (ret == 0) {
rs->rs_rbm = rbm;
rs->rs_free = extlen;
@@ -1639,7 +1640,6 @@ fail:
* @ip: If set, check for reservations
* @nowrap: Stop looking at the end of the rgrp, rather than wrapping
* around until we've reached the starting point.
- * @ap: the allocation parameters
*
* Side effects:
* - If looking for free blocks, we set GBF_FULL on each bitmap which
@@ -1651,8 +1651,7 @@ fail:
*/
static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
- const struct gfs2_inode *ip, bool nowrap,
- const struct gfs2_alloc_parms *ap)
+ const struct gfs2_inode *ip, bool nowrap)
{
struct buffer_head *bh;
int initial_bii;
@@ -1773,7 +1772,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
while (1) {
down_write(&sdp->sd_log_flush_lock);
error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
- true, NULL);
+ true);
up_write(&sdp->sd_log_flush_lock);
if (error == -ENOSPC)
break;
@@ -2100,7 +2099,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
{
struct gfs2_blkreserv *rs = &ip->i_res;
- if (rs->rs_rgd_gh.gh_gl)
+ if (gfs2_holder_initialized(&rs->rs_rgd_gh))
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
}
@@ -2330,12 +2329,11 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
int error;
gfs2_set_alloc_start(&rbm, ip, dinode);
- error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
+ error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
if (error == -ENOSPC) {
gfs2_set_alloc_start(&rbm, ip, dinode);
- error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
- NULL);
+ error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
}
/* Since all blocks are reserved in advance, this shouldn't happen */
@@ -2601,7 +2599,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
{
unsigned int x;
- rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
+ rlist->rl_ghs = kmalloc(rlist->rl_rgrps * sizeof(struct gfs2_holder),
GFP_NOFS | __GFP_NOFAIL);
for (x = 0; x < rlist->rl_rgrps; x++)
gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f8a0cd8212909..3a7e60bb39f8f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -855,7 +855,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
- if (freeze_gh.gh_gl)
+ if (gfs2_holder_initialized(&freeze_gh))
gfs2_glock_dq_uninit(&freeze_gh);
gfs2_quota_cleanup(sdp);
@@ -1033,7 +1033,7 @@ static int gfs2_unfreeze(struct super_block *sb)
mutex_lock(&sdp->sd_freeze_mutex);
if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
- sdp->sd_freeze_gh.gh_gl == NULL) {
+ !gfs2_holder_initialized(&sdp->sd_freeze_gh)) {
mutex_unlock(&sdp->sd_freeze_mutex);
return 0;
}
@@ -1084,9 +1084,11 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
int error = 0, err;
memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
- gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+ gha = kmalloc(slots * sizeof(struct gfs2_holder), GFP_KERNEL);
if (!gha)
return -ENOMEM;
+ for (x = 0; x < slots; x++)
+ gfs2_holder_mark_uninitialized(gha + x);
rgd_next = gfs2_rgrpd_get_first(sdp);
@@ -1096,7 +1098,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
for (x = 0; x < slots; x++) {
gh = gha + x;
- if (gh->gh_gl && gfs2_glock_poll(gh)) {
+ if (gfs2_holder_initialized(gh) && gfs2_glock_poll(gh)) {
err = gfs2_glock_wait(gh);
if (err) {
gfs2_holder_uninit(gh);
@@ -1109,7 +1111,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
}
}
- if (gh->gh_gl)
+ if (gfs2_holder_initialized(gh))
done = 0;
else if (rgd_next && !error) {
error = gfs2_glock_nq_init(rgd_next->rd_gl,
@@ -1176,7 +1178,7 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s
static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct super_block *sb = d_inode(dentry)->i_sb;
+ struct super_block *sb = dentry->d_sb;
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_statfs_change_host sc;
int error;
@@ -1304,9 +1306,11 @@ static int gfs2_drop_inode(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
- if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) {
+ if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) &&
+ inode->i_nlink &&
+ gfs2_holder_initialized(&ip->i_iopen_gh)) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
- if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags))
clear_nlink(inode);
}
return generic_drop_inode(inode);
@@ -1551,7 +1555,7 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_truncate;
}
- if (ip->i_iopen_gh.gh_gl &&
+ if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_wait(&ip->i_iopen_gh);
@@ -1610,7 +1614,7 @@ out_unlock:
if (gfs2_rs_active(&ip->i_res))
gfs2_rs_deltree(&ip->i_res);
- if (ip->i_iopen_gh.gh_gl) {
+ if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_wait(&ip->i_iopen_gh);
@@ -1632,7 +1636,7 @@ out:
gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put(ip->i_gl);
ip->i_gl = NULL;
- if (ip->i_iopen_gh.gh_gl) {
+ if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_wait(&ip->i_iopen_gh);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index cf645835710f8..aee4485ad8a9b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -68,6 +68,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
fs_err(sdp, "telling LM to unmount\n");
lm->lm_unmount(sdp);
}
+ set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
fs_err(sdp, "withdrawn\n");
dump_stack();
}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index e8dfb4740c049..3a28535040841 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -583,13 +583,11 @@ out:
*
* Returns: actual size of data on success, -errno on error
*/
-static int gfs2_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int __gfs2_xattr_get(struct inode *inode, const char *name,
+ void *buffer, size_t size, int type)
{
- struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_ea_location el;
- int type = handler->flags;
int error;
if (!ip->i_eattr)
@@ -611,6 +609,29 @@ static int gfs2_xattr_get(const struct xattr_handler *handler,
return error;
}
+static int gfs2_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ bool need_unlock = false;
+ int ret;
+
+ /* During lookup, SELinux calls this function with the glock locked. */
+
+ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (ret)
+ return ret;
+ need_unlock = true;
+ }
+ ret = __gfs2_xattr_get(inode, name, buffer, size, handler->flags);
+ if (need_unlock)
+ gfs2_glock_dq_uninit(&gh);
+ return ret;
+}
+
/**
* ea_alloc_blk - allocates a new block for extended attributes.
* @ip: A pointer to the inode that's getting extended attributes
@@ -1230,11 +1251,24 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
}
static int gfs2_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return __gfs2_xattr_set(d_inode(dentry), name, value,
- size, flags, handler->flags);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ ret = gfs2_rsqa_alloc(ip);
+ if (ret)
+ return ret;
+
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (ret)
+ return ret;
+ ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags);
+ gfs2_glock_dq_uninit(&gh);
+ return ret;
}
static int ea_dealloc_indirect(struct gfs2_inode *ip)
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 8d931b157bbef..d9a86919fdf6e 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -13,10 +13,10 @@
#include "hfs_fs.h"
#include "btree.h"
-int hfs_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int hfs_setxattr(struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct hfs_cat_file *file;
@@ -56,10 +56,9 @@ out:
return res;
}
-ssize_t hfs_getxattr(struct dentry *dentry, const char *name,
- void *value, size_t size)
+ssize_t hfs_getxattr(struct dentry *unused, struct inode *inode,
+ const char *name, void *value, size_t size)
{
- struct inode *inode = d_inode(dentry);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct hfs_cat_file *file;
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 221719eac5de6..d77d844b668b1 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -278,14 +278,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
mapping = tree->inode->i_mapping;
off = (loff_t)cnid * tree->node_size;
- block = off >> PAGE_CACHE_SHIFT;
- node->page_offset = off & ~PAGE_CACHE_MASK;
+ block = off >> PAGE_SHIFT;
+ node->page_offset = off & ~PAGE_MASK;
for (i = 0; i < tree->pages_per_bnode; i++) {
page = read_mapping_page(mapping, block++, NULL);
if (IS_ERR(page))
goto fail;
if (PageError(page)) {
- page_cache_release(page);
+ put_page(page);
goto fail;
}
node->page[i] = page;
@@ -401,7 +401,7 @@ void hfs_bnode_free(struct hfs_bnode *node)
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
- page_cache_release(node->page[i]);
+ put_page(node->page[i]);
kfree(node);
}
@@ -429,11 +429,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
pagep = node->page;
memset(kmap(*pagep) + node->page_offset, 0,
- min((int)PAGE_CACHE_SIZE, (int)tree->node_size));
+ min((int)PAGE_SIZE, (int)tree->node_size));
set_page_dirty(*pagep);
kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+ memset(kmap(*++pagep), 0, PAGE_SIZE);
set_page_dirty(*pagep);
kunmap(*pagep);
}
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 1ab19e660e690..37cdd955eceb2 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -116,14 +116,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
}
tree->node_size_shift = ffs(size) - 1;
- tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
kunmap(page);
- page_cache_release(page);
+ put_page(page);
return tree;
fail_page:
- page_cache_release(page);
+ put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfs_aops;
iput(tree->inode);
@@ -257,9 +257,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
idx = 0;
for (;;) {
@@ -279,7 +279,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
}
- if (++off >= PAGE_CACHE_SIZE) {
+ if (++off >= PAGE_SIZE) {
kunmap(*pagep);
data = kmap(*++pagep);
off = 0;
@@ -302,9 +302,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
len = hfs_brec_lenoff(node, 0, &off16);
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
}
}
@@ -348,9 +348,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
len = hfs_brec_lenoff(node, 0, &off);
}
off += node->page_offset + nidx / 8;
- page = node->page[off >> PAGE_CACHE_SHIFT];
+ page = node->page[off >> PAGE_SHIFT];
data = kmap(page);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 1eb5d415d4346..8f4afd3f5108d 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -20,7 +20,7 @@
*
* Given the ID of the parent and the name build a search key.
*/
-void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name)
+void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, const struct qstr *name)
{
key->cat.reserved = 0;
key->cat.ParID = cpu_to_be32(parent);
@@ -64,7 +64,7 @@ static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode)
static int hfs_cat_build_thread(struct super_block *sb,
hfs_cat_rec *rec, int type,
- u32 parentid, struct qstr *name)
+ u32 parentid, const struct qstr *name)
{
rec->type = type;
memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved));
@@ -79,7 +79,7 @@ static int hfs_cat_build_thread(struct super_block *sb,
* Add a new file or directory to the catalog B-tree and
* return a (struct hfs_cat_entry) for it in '*result'.
*/
-int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
+int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode)
{
struct hfs_find_data fd;
struct super_block *sb;
@@ -210,7 +210,7 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
* Delete the indicated file or directory.
* The associated thread is also removed unless ('with_thread'==0).
*/
-int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
+int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
{
struct super_block *sb;
struct hfs_find_data fd;
@@ -240,10 +240,13 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
}
}
+ /* we only need to take spinlock for exclusion with ->release() */
+ spin_lock(&HFS_I(dir)->open_dir_lock);
list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) {
if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
rd->file->f_pos--;
}
+ spin_unlock(&HFS_I(dir)->open_dir_lock);
res = hfs_brec_remove(&fd);
if (res)
@@ -274,8 +277,8 @@ out:
* If the destination exists it is removed and a
* (struct hfs_cat_entry) for it is returned in '*result'.
*/
-int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
- struct inode *dst_dir, struct qstr *dst_name)
+int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
+ struct inode *dst_dir, const struct qstr *dst_name)
{
struct super_block *sb;
struct hfs_find_data src_fd, dst_fd;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index e9f2b855f8316..163190ecc0d2e 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -161,8 +161,14 @@ static int hfs_readdir(struct file *file, struct dir_context *ctx)
}
file->private_data = rd;
rd->file = file;
+ spin_lock(&HFS_I(inode)->open_dir_lock);
list_add(&rd->list, &HFS_I(inode)->open_dir_list);
+ spin_unlock(&HFS_I(inode)->open_dir_lock);
}
+ /*
+ * Can be done after the list insertion; exclusion with
+ * hfs_cat_delete() is provided by directory lock.
+ */
memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
out:
hfs_find_exit(&fd);
@@ -173,9 +179,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
{
struct hfs_readdir_data *rd = file->private_data;
if (rd) {
- inode_lock(inode);
+ spin_lock(&HFS_I(inode)->open_dir_lock);
list_del(&rd->list);
- inode_unlock(inode);
+ spin_unlock(&HFS_I(inode)->open_dir_lock);
kfree(rd);
}
return 0;
@@ -303,7 +309,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
const struct file_operations hfs_dir_operations = {
.read = generic_read_dir,
- .iterate = hfs_readdir,
+ .iterate_shared = hfs_readdir,
.llseek = generic_file_llseek,
.release = hfs_dir_release,
};
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 1f1c7dcbcc2ff..16f5172ee40db 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -69,6 +69,7 @@ struct hfs_inode_info {
struct hfs_cat_key cat_key;
struct list_head open_dir_list;
+ spinlock_t open_dir_lock;
struct inode *rsrc_inode;
struct mutex extents_lock;
@@ -177,11 +178,11 @@ extern int hfs_clear_vbm_bits(struct super_block *, u16, u16);
extern int hfs_cat_keycmp(const btree_key *, const btree_key *);
struct hfs_find_data;
extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *);
-extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *);
-extern int hfs_cat_delete(u32, struct inode *, struct qstr *);
-extern int hfs_cat_move(u32, struct inode *, struct qstr *,
- struct inode *, struct qstr *);
-extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *);
+extern int hfs_cat_create(u32, struct inode *, const struct qstr *, struct inode *);
+extern int hfs_cat_delete(u32, struct inode *, const struct qstr *);
+extern int hfs_cat_move(u32, struct inode *, const struct qstr *,
+ struct inode *, const struct qstr *);
+extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, const struct qstr *);
/* dir.c */
extern const struct file_operations hfs_dir_operations;
@@ -200,7 +201,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern const struct address_space_operations hfs_aops;
extern const struct address_space_operations hfs_btree_aops;
-extern struct inode *hfs_new_inode(struct inode *, struct qstr *, umode_t);
+extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
extern int hfs_write_inode(struct inode *, struct writeback_control *);
extern int hfs_inode_setattr(struct dentry *, struct iattr *);
@@ -211,10 +212,10 @@ extern void hfs_evict_inode(struct inode *);
extern void hfs_delete_inode(struct inode *);
/* attr.c */
-extern int hfs_setxattr(struct dentry *dentry, const char *name,
+extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-extern ssize_t hfs_getxattr(struct dentry *dentry, const char *name,
- void *value, size_t size);
+extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size);
extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* mdb.c */
@@ -232,11 +233,11 @@ extern const struct dentry_operations hfs_dentry_operations;
extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
extern int hfs_strcmp(const unsigned char *, unsigned int,
const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+extern int hfs_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
/* trans.c */
-extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
+extern void hfs_asc2mac(struct super_block *, struct hfs_name *, const struct qstr *);
extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *);
/* super.c */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 6686bf39a5b5a..c6a32415735bc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -91,8 +91,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
if (!tree)
return 0;
- if (tree->node_size >= PAGE_CACHE_SIZE) {
- nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
+ if (tree->node_size >= PAGE_SIZE) {
+ nidx = page->index >> (tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
@@ -105,8 +105,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
}
spin_unlock(&tree->hash_lock);
} else {
- nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
- i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+ nidx = page->index << (PAGE_SHIFT - tree->node_size_shift);
+ i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
do {
node = hfs_bnode_findhash(tree, nidx++);
@@ -124,16 +124,15 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
return res ? try_to_free_buffers(page) : 0;
}
-static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- struct inode *inode = file_inode(file)->i_mapping->host;
+ struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -141,7 +140,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
*/
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
+ loff_t end = iocb->ki_pos + count;
if (end > isize)
hfs_write_failed(mapping, end);
@@ -178,7 +177,7 @@ const struct address_space_operations hfs_aops = {
/*
* hfs_new_inode
*/
-struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode)
+struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = new_inode(sb);
@@ -187,6 +186,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode)
mutex_init(&HFS_I(inode)->extents_lock);
INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
+ spin_lock_init(&HFS_I(inode)->open_dir_lock);
hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
inode->i_ino = HFS_SB(sb)->next_id++;
inode->i_mode = mode;
@@ -318,6 +318,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
HFS_I(inode)->rsrc_inode = NULL;
mutex_init(&HFS_I(inode)->extents_lock);
INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
+ spin_lock_init(&HFS_I(inode)->open_dir_lock);
/* Initialize the inode */
inode->i_uid = hsb->s_uid;
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 85b610c3909fb..3912209153a84 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -59,7 +59,7 @@ int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
if (len > HFS_NAMELEN)
len = HFS_NAMELEN;
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (; len; len--)
hash = partial_name_hash(caseorder[*name++], hash);
this->hash = end_name_hash(hash);
@@ -92,7 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
* Test for equality of two strings in the HFS filename character ordering.
* return 1 on failure and 0 on success
*/
-int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+int hfs_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
const unsigned char *n1, *n2;
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index b1ce4c7ad3fb4..39f5e343bf4d4 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -94,7 +94,7 @@ out:
* This routine is a inverse to hfs_mac2triv().
* A ':' is replaced by a '/'.
*/
-void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in)
+void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr *in)
{
struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
struct nls_table *nls_io = HFS_SB(sb)->nls_io;
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index d2954451519ed..c0ae274c0a225 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -13,7 +13,7 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
-#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8)
+#define PAGE_CACHE_BITS (PAGE_SIZE * 8)
int hfsplus_block_allocate(struct super_block *sb, u32 size,
u32 offset, u32 *max)
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 63924662aaf3e..ce014ceb89efc 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -24,16 +24,16 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
int l;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
- off &= ~PAGE_CACHE_MASK;
+ pagep = node->page + (off >> PAGE_SHIFT);
+ off &= ~PAGE_MASK;
- l = min_t(int, len, PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_SIZE - off);
memcpy(buf, kmap(*pagep) + off, l);
kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memcpy(buf, kmap(*++pagep), l);
kunmap(*pagep);
}
@@ -77,17 +77,17 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
int l;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
- off &= ~PAGE_CACHE_MASK;
+ pagep = node->page + (off >> PAGE_SHIFT);
+ off &= ~PAGE_MASK;
- l = min_t(int, len, PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_SIZE - off);
memcpy(kmap(*pagep) + off, buf, l);
set_page_dirty(*pagep);
kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memcpy(kmap(*++pagep), buf, l);
set_page_dirty(*pagep);
kunmap(*pagep);
@@ -107,16 +107,16 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
int l;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
- off &= ~PAGE_CACHE_MASK;
+ pagep = node->page + (off >> PAGE_SHIFT);
+ off &= ~PAGE_MASK;
- l = min_t(int, len, PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_SIZE - off);
memset(kmap(*pagep) + off, 0, l);
set_page_dirty(*pagep);
kunmap(*pagep);
while ((len -= l) != 0) {
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memset(kmap(*++pagep), 0, l);
set_page_dirty(*pagep);
kunmap(*pagep);
@@ -136,20 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
tree = src_node->tree;
src += src_node->page_offset;
dst += dst_node->page_offset;
- src_page = src_node->page + (src >> PAGE_CACHE_SHIFT);
- src &= ~PAGE_CACHE_MASK;
- dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT);
- dst &= ~PAGE_CACHE_MASK;
+ src_page = src_node->page + (src >> PAGE_SHIFT);
+ src &= ~PAGE_MASK;
+ dst_page = dst_node->page + (dst >> PAGE_SHIFT);
+ dst &= ~PAGE_MASK;
if (src == dst) {
- l = min_t(int, len, PAGE_CACHE_SIZE - src);
+ l = min_t(int, len, PAGE_SIZE - src);
memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l);
kunmap(*src_page);
set_page_dirty(*dst_page);
kunmap(*dst_page);
while ((len -= l) != 0) {
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memcpy(kmap(*++dst_page), kmap(*++src_page), l);
kunmap(*src_page);
set_page_dirty(*dst_page);
@@ -161,12 +161,12 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
do {
src_ptr = kmap(*src_page) + src;
dst_ptr = kmap(*dst_page) + dst;
- if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
- l = PAGE_CACHE_SIZE - src;
+ if (PAGE_SIZE - src < PAGE_SIZE - dst) {
+ l = PAGE_SIZE - src;
src = 0;
dst += l;
} else {
- l = PAGE_CACHE_SIZE - dst;
+ l = PAGE_SIZE - dst;
src += l;
dst = 0;
}
@@ -195,11 +195,11 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
dst += node->page_offset;
if (dst > src) {
src += len - 1;
- src_page = node->page + (src >> PAGE_CACHE_SHIFT);
- src = (src & ~PAGE_CACHE_MASK) + 1;
+ src_page = node->page + (src >> PAGE_SHIFT);
+ src = (src & ~PAGE_MASK) + 1;
dst += len - 1;
- dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
- dst = (dst & ~PAGE_CACHE_MASK) + 1;
+ dst_page = node->page + (dst >> PAGE_SHIFT);
+ dst = (dst & ~PAGE_MASK) + 1;
if (src == dst) {
while (src < len) {
@@ -208,7 +208,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
set_page_dirty(*dst_page);
kunmap(*dst_page);
len -= src;
- src = PAGE_CACHE_SIZE;
+ src = PAGE_SIZE;
src_page--;
dst_page--;
}
@@ -226,32 +226,32 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
dst_ptr = kmap(*dst_page) + dst;
if (src < dst) {
l = src;
- src = PAGE_CACHE_SIZE;
+ src = PAGE_SIZE;
dst -= l;
} else {
l = dst;
src -= l;
- dst = PAGE_CACHE_SIZE;
+ dst = PAGE_SIZE;
}
l = min(len, l);
memmove(dst_ptr - l, src_ptr - l, l);
kunmap(*src_page);
set_page_dirty(*dst_page);
kunmap(*dst_page);
- if (dst == PAGE_CACHE_SIZE)
+ if (dst == PAGE_SIZE)
dst_page--;
else
src_page--;
} while ((len -= l));
}
} else {
- src_page = node->page + (src >> PAGE_CACHE_SHIFT);
- src &= ~PAGE_CACHE_MASK;
- dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
- dst &= ~PAGE_CACHE_MASK;
+ src_page = node->page + (src >> PAGE_SHIFT);
+ src &= ~PAGE_MASK;
+ dst_page = node->page + (dst >> PAGE_SHIFT);
+ dst &= ~PAGE_MASK;
if (src == dst) {
- l = min_t(int, len, PAGE_CACHE_SIZE - src);
+ l = min_t(int, len, PAGE_SIZE - src);
memmove(kmap(*dst_page) + src,
kmap(*src_page) + src, l);
kunmap(*src_page);
@@ -259,7 +259,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
kunmap(*dst_page);
while ((len -= l) != 0) {
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memmove(kmap(*++dst_page),
kmap(*++src_page), l);
kunmap(*src_page);
@@ -272,13 +272,13 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
do {
src_ptr = kmap(*src_page) + src;
dst_ptr = kmap(*dst_page) + dst;
- if (PAGE_CACHE_SIZE - src <
- PAGE_CACHE_SIZE - dst) {
- l = PAGE_CACHE_SIZE - src;
+ if (PAGE_SIZE - src <
+ PAGE_SIZE - dst) {
+ l = PAGE_SIZE - src;
src = 0;
dst += l;
} else {
- l = PAGE_CACHE_SIZE - dst;
+ l = PAGE_SIZE - dst;
src += l;
dst = 0;
}
@@ -444,14 +444,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
mapping = tree->inode->i_mapping;
off = (loff_t)cnid << tree->node_size_shift;
- block = off >> PAGE_CACHE_SHIFT;
- node->page_offset = off & ~PAGE_CACHE_MASK;
+ block = off >> PAGE_SHIFT;
+ node->page_offset = off & ~PAGE_MASK;
for (i = 0; i < tree->pages_per_bnode; block++, i++) {
page = read_mapping_page(mapping, block, NULL);
if (IS_ERR(page))
goto fail;
if (PageError(page)) {
- page_cache_release(page);
+ put_page(page);
goto fail;
}
node->page[i] = page;
@@ -569,7 +569,7 @@ void hfs_bnode_free(struct hfs_bnode *node)
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
- page_cache_release(node->page[i]);
+ put_page(node->page[i]);
kfree(node);
}
@@ -597,11 +597,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
pagep = node->page;
memset(kmap(*pagep) + node->page_offset, 0,
- min_t(int, PAGE_CACHE_SIZE, tree->node_size));
+ min_t(int, PAGE_SIZE, tree->node_size));
set_page_dirty(*pagep);
kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+ memset(kmap(*++pagep), 0, PAGE_SIZE);
set_page_dirty(*pagep);
kunmap(*pagep);
}
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 3345c7553edc1..d9d1a36ba8266 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -236,15 +236,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
tree->node_size_shift = ffs(size) - 1;
tree->pages_per_bnode =
- (tree->node_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ (tree->node_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
kunmap(page);
- page_cache_release(page);
+ put_page(page);
return tree;
fail_page:
- page_cache_release(page);
+ put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfsplus_aops;
iput(tree->inode);
@@ -380,9 +380,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
idx = 0;
for (;;) {
@@ -403,7 +403,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
}
- if (++off >= PAGE_CACHE_SIZE) {
+ if (++off >= PAGE_SIZE) {
kunmap(*pagep);
data = kmap(*++pagep);
off = 0;
@@ -426,9 +426,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
len = hfs_brec_lenoff(node, 0, &off16);
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
}
}
@@ -475,9 +475,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
len = hfs_brec_lenoff(node, 0, &off);
}
off += node->page_offset + nidx / 8;
- page = node->page[off >> PAGE_CACHE_SHIFT];
+ page = node->page[off >> PAGE_SHIFT];
data = kmap(page);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 022974ab6e3cc..142534d3c2d50 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -40,7 +40,7 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
/* Generates key for catalog file/folders record. */
int hfsplus_cat_build_key(struct super_block *sb,
- hfsplus_btree_key *key, u32 parent, struct qstr *str)
+ hfsplus_btree_key *key, u32 parent, const struct qstr *str)
{
int len, err;
@@ -174,7 +174,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
static int hfsplus_fill_cat_thread(struct super_block *sb,
hfsplus_cat_entry *entry, int type,
- u32 parentid, struct qstr *str)
+ u32 parentid, const struct qstr *str)
{
int err;
@@ -250,7 +250,7 @@ static void hfsplus_subfolders_dec(struct inode *dir)
}
int hfsplus_create_cat(u32 cnid, struct inode *dir,
- struct qstr *str, struct inode *inode)
+ const struct qstr *str, struct inode *inode)
{
struct super_block *sb = dir->i_sb;
struct hfs_find_data fd;
@@ -318,7 +318,7 @@ err2:
return err;
}
-int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
+int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
{
struct super_block *sb = dir->i_sb;
struct hfs_find_data fd;
@@ -374,12 +374,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
}
+ /* we only need to take spinlock for exclusion with ->release() */
+ spin_lock(&HFSPLUS_I(dir)->open_dir_lock);
list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
struct hfsplus_readdir_data *rd =
list_entry(pos, struct hfsplus_readdir_data, list);
if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
rd->file->f_pos--;
}
+ spin_unlock(&HFSPLUS_I(dir)->open_dir_lock);
err = hfs_brec_remove(&fd);
if (err)
@@ -412,8 +415,8 @@ out:
}
int hfsplus_rename_cat(u32 cnid,
- struct inode *src_dir, struct qstr *src_name,
- struct inode *dst_dir, struct qstr *dst_name)
+ struct inode *src_dir, const struct qstr *src_name,
+ struct inode *dst_dir, const struct qstr *dst_name)
{
struct super_block *sb = src_dir->i_sb;
struct hfs_find_data src_fd, dst_fd;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index a4e867e089478..42e128661dc15 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -271,8 +271,14 @@ next:
}
file->private_data = rd;
rd->file = file;
+ spin_lock(&HFSPLUS_I(inode)->open_dir_lock);
list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
+ spin_unlock(&HFSPLUS_I(inode)->open_dir_lock);
}
+ /*
+ * Can be done after the list insertion; exclusion with
+ * hfsplus_delete_cat() is provided by directory lock.
+ */
memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
out:
kfree(strbuf);
@@ -284,9 +290,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
{
struct hfsplus_readdir_data *rd = file->private_data;
if (rd) {
- inode_lock(inode);
+ spin_lock(&HFSPLUS_I(inode)->open_dir_lock);
list_del(&rd->list);
- inode_unlock(inode);
+ spin_unlock(&HFSPLUS_I(inode)->open_dir_lock);
kfree(rd);
}
return 0;
@@ -569,7 +575,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
const struct file_operations hfsplus_dir_operations = {
.fsync = hfsplus_file_fsync,
.read = generic_read_dir,
- .iterate = hfsplus_readdir,
+ .iterate_shared = hfsplus_readdir,
.unlocked_ioctl = hfsplus_ioctl,
.llseek = generic_file_llseek,
.release = hfsplus_dir_release,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index f91a1faf819e9..a3f03b2474637 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -244,6 +244,7 @@ struct hfsplus_inode_info {
u8 userflags; /* BSD user file flags */
u32 subfolders; /* Subfolder count (HFSX only) */
struct list_head open_dir_list;
+ spinlock_t open_dir_lock;
loff_t phys_size;
struct inode vfs_inode;
@@ -444,17 +445,17 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1,
int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
const hfsplus_btree_key *k2);
int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
- u32 parent, struct qstr *str);
+ u32 parent, const struct qstr *str);
void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
hfsplus_btree_key *key, u32 parent);
void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
int hfsplus_find_cat(struct super_block *sb, u32 cnid,
struct hfs_find_data *fd);
-int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str,
+int hfsplus_create_cat(u32 cnid, struct inode *dir, const struct qstr *str,
struct inode *inode);
-int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str);
-int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name,
- struct inode *dst_dir, struct qstr *dst_name);
+int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str);
+int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
+ struct inode *dst_dir, const struct qstr *dst_name);
/* dir.c */
extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -519,13 +520,12 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
int max_unistr_len, const char *astr, int len);
int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
-int hfsplus_compare_dentry(const struct dentry *parent,
- const struct dentry *dentry, unsigned int len,
+int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name);
/* wrapper.c */
int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf,
- void **data, int rw);
+ void **data, int op, int op_flags);
int hfsplus_read_wrapper(struct super_block *sb);
/* time macros */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1a6394cdb54ef..19462d773fe24 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -87,9 +87,9 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
}
if (!tree)
return 0;
- if (tree->node_size >= PAGE_CACHE_SIZE) {
+ if (tree->node_size >= PAGE_SIZE) {
nidx = page->index >>
- (tree->node_size_shift - PAGE_CACHE_SHIFT);
+ (tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
@@ -103,8 +103,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
spin_unlock(&tree->hash_lock);
} else {
nidx = page->index <<
- (PAGE_CACHE_SHIFT - tree->node_size_shift);
- i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+ (PAGE_SHIFT - tree->node_size_shift);
+ i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
do {
node = hfs_bnode_findhash(tree, nidx++);
@@ -122,16 +122,15 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
return res ? try_to_free_buffers(page) : 0;
}
-static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- struct inode *inode = file_inode(file)->i_mapping->host;
+ struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, hfsplus_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -139,7 +138,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
*/
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
+ loff_t end = iocb->ki_pos + count;
if (end > isize)
hfsplus_write_failed(mapping, end);
@@ -374,6 +373,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
hip = HFSPLUS_I(inode);
INIT_LIST_HEAD(&hip->open_dir_list);
+ spin_lock_init(&hip->open_dir_lock);
mutex_init(&hip->extents_lock);
atomic_set(&hip->opencnt, 0);
hip->extent_state = 0;
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index eb355d81e2798..63164ebc52fa1 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -112,7 +112,8 @@ static int hfs_parse_new_pmap(struct super_block *sb, void *buf,
if ((u8 *)pm - (u8 *)buf >= buf_size) {
res = hfsplus_submit_bio(sb,
*part_start + HFS_PMAP_BLK + i,
- buf, (void **)&pm, READ);
+ buf, (void **)&pm, REQ_OP_READ,
+ 0);
if (res)
return res;
}
@@ -136,7 +137,7 @@ int hfs_part_find(struct super_block *sb,
return -ENOMEM;
res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK,
- buf, &data, READ);
+ buf, &data, REQ_OP_READ, 0);
if (res)
goto out;
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index afb33eda6d7db..ab7ea2506b4de 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -48,9 +48,6 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
return acl;
}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 5d54490a136d8..11854dd845726 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -67,6 +67,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
return inode;
INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+ spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock);
mutex_init(&HFSPLUS_I(inode)->extents_lock);
HFSPLUS_I(inode)->flags = 0;
HFSPLUS_I(inode)->extent_state = 0;
@@ -219,7 +220,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
- sbi->s_vhdr_buf, NULL, WRITE_SYNC);
+ sbi->s_vhdr_buf, NULL, REQ_OP_WRITE,
+ WRITE_SYNC);
if (!error)
error = error2;
if (!write_backup)
@@ -227,7 +229,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + sbi->sect_count - 2,
- sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC);
+ sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE,
+ WRITE_SYNC);
if (!error)
error2 = error;
out:
@@ -438,7 +441,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = -EFBIG;
last_fs_block = sbi->total_blocks - 1;
last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
(last_fs_page > (pgoff_t)(~0ULL))) {
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index e8ef121a4d8b5..e563939882f35 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -346,7 +346,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
astr = str->name;
len = str->len;
while (len > 0) {
@@ -385,10 +385,10 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
* Composed unicode characters are decomposed and case-folding is performed
* if the appropriate bits are (un)set on the superblock.
*/
-int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+int hfsplus_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- struct super_block *sb = parent->d_sb;
+ struct super_block *sb = dentry->d_sb;
int casefold, decompose, size;
int dsize1, dsize2, len1, len2;
const u16 *dstr1, *dstr2;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index cc62356714376..ebb85e5f65499 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -30,7 +30,8 @@ struct hfsplus_wd {
* @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
* @buf: buffer for I/O
* @data: output pointer for location of requested data
- * @rw: direction of I/O
+ * @op: direction of I/O
+ * @op_flags: request op flags
*
* The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
* HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
@@ -44,7 +45,7 @@ struct hfsplus_wd {
* will work correctly.
*/
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
- void *buf, void **data, int rw)
+ void *buf, void **data, int op, int op_flags)
{
struct bio *bio;
int ret = 0;
@@ -65,8 +66,9 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
bio = bio_alloc(GFP_NOIO, 1);
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = sb->s_bdev;
+ bio_set_op_attrs(bio, op, op_flags);
- if (!(rw & WRITE) && data)
+ if (op != WRITE && data)
*data = (u8 *)buf + offset;
while (io_size > 0) {
@@ -83,7 +85,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
buf = (u8 *)buf + len;
}
- ret = submit_bio_wait(rw, bio);
+ ret = submit_bio_wait(bio);
out:
bio_put(bio);
return ret < 0 ? ret : 0;
@@ -181,7 +183,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
reread:
error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR,
sbi->s_vhdr_buf, (void **)&sbi->s_vhdr,
- READ);
+ REQ_OP_READ, 0);
if (error)
goto out_free_backup_vhdr;
@@ -213,7 +215,8 @@ reread:
error = hfsplus_submit_bio(sb, part_start + part_size - 2,
sbi->s_backup_vhdr_buf,
- (void **)&sbi->s_backup_vhdr, READ);
+ (void **)&sbi->s_backup_vhdr, REQ_OP_READ,
+ 0);
if (error)
goto out_free_backup_vhdr;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index ab01530b4930f..d37bb88dc746e 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -220,7 +220,7 @@ check_attr_tree_state_again:
index = 0;
written = 0;
- for (; written < node_size; index++, written += PAGE_CACHE_SIZE) {
+ for (; written < node_size; index++, written += PAGE_SIZE) {
void *kaddr;
page = read_mapping_page(mapping, index, NULL);
@@ -231,11 +231,11 @@ check_attr_tree_state_again:
kaddr = kmap_atomic(page);
memcpy(kaddr, buf + written,
- min_t(size_t, PAGE_CACHE_SIZE, node_size - written));
+ min_t(size_t, PAGE_SIZE, node_size - written));
kunmap_atomic(kaddr);
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
}
hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY);
@@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
return len;
}
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags,
const char *prefix, size_t prefixlen)
{
@@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
return -ENOMEM;
strcpy(xattr_name, prefix);
strcpy(xattr_name + prefixlen, name);
- res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size,
- flags);
+ res = __hfsplus_setxattr(inode, xattr_name, value, size, flags);
kfree(xattr_name);
return res;
}
@@ -579,7 +578,7 @@ failed_getxattr_init:
return res;
}
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ssize_t hfsplus_getxattr(struct inode *inode, const char *name,
void *value, size_t size,
const char *prefix, size_t prefixlen)
{
@@ -594,7 +593,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
strcpy(xattr_name, prefix);
strcpy(xattr_name + prefixlen, name);
- res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size);
+ res = __hfsplus_getxattr(inode, xattr_name, value, size);
kfree(xattr_name);
return res;
@@ -844,8 +843,8 @@ end_removexattr:
}
static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
/*
* Don't allow retrieving properly prefixed attributes
@@ -860,12 +859,13 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
* creates), so we pass the name through unmodified (after
* ensuring it doesn't conflict with another namespace).
*/
- return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
+ return __hfsplus_getxattr(inode, name, buffer, size);
}
static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
/*
* Don't allow setting properly prefixed attributes
@@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
* creates), so we pass the name through unmodified (after
* ensuring it doesn't conflict with another namespace).
*/
- return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
+ return __hfsplus_setxattr(inode, name, buffer, size, flags);
}
const struct xattr_handler hfsplus_xattr_osx_handler = {
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index f9b0955b3d281..68f6b539371f5 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -21,14 +21,14 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[];
int __hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags,
const char *prefix, size_t prefixlen);
ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
void *value, size_t size);
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ssize_t hfsplus_getxattr(struct inode *inode, const char *name,
void *value, size_t size,
const char *prefix, size_t prefixlen);
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 72a68a3a0c996..37b3efa733ef2 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -14,19 +14,20 @@
#include "acl.h"
static int hfsplus_security_getxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return hfsplus_getxattr(dentry, name, buffer, size,
+ return hfsplus_getxattr(inode, name, buffer, size,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
static int hfsplus_security_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ return hfsplus_setxattr(inode, name, buffer, size, flags,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 95a7704c7abb7..94519d6c627df 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -12,19 +12,20 @@
#include "xattr.h"
static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return hfsplus_getxattr(dentry, name, buffer, size,
+ return hfsplus_getxattr(inode, name, buffer, size,
XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN);
}
static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ return hfsplus_setxattr(inode, name, buffer, size, flags,
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 6fc269baf959a..fae6c0ea00305 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -12,19 +12,20 @@
#include "xattr.h"
static int hfsplus_user_getxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return hfsplus_getxattr(dentry, name, buffer, size,
+ return hfsplus_getxattr(inode, name, buffer, size,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
static int hfsplus_user_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ return hfsplus_setxattr(inode, name, buffer, size, flags,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d1abbee281d19..90e46cd752fe7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -398,7 +398,7 @@ static const struct file_operations hostfs_file_fops = {
static const struct file_operations hostfs_dir_fops = {
.llseek = generic_file_llseek,
- .iterate = hostfs_readdir,
+ .iterate_shared = hostfs_readdir,
.read = generic_read_dir,
.open = hostfs_open,
.fsync = hostfs_fsync,
@@ -410,12 +410,12 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = mapping->host;
char *buffer;
loff_t base = page_offset(page);
- int count = PAGE_CACHE_SIZE;
- int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ int count = PAGE_SIZE;
+ int end_index = inode->i_size >> PAGE_SHIFT;
int err;
if (page->index >= end_index)
- count = inode->i_size & (PAGE_CACHE_SIZE-1);
+ count = inode->i_size & (PAGE_SIZE-1);
buffer = kmap(page);
@@ -447,7 +447,7 @@ static int hostfs_readpage(struct file *file, struct page *page)
buffer = kmap(page);
bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (bytes_read < 0) {
ClearPageUptodate(page);
SetPageError(page);
@@ -455,7 +455,7 @@ static int hostfs_readpage(struct file *file, struct page *page)
goto out;
}
- memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read);
+ memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read);
ClearPageError(page);
SetPageUptodate(page);
@@ -471,7 +471,7 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
*pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
@@ -485,14 +485,14 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
void *buffer;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
int err;
buffer = kmap(page);
err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
kunmap(page);
- if (!PageUptodate(page) && err == PAGE_CACHE_SIZE)
+ if (!PageUptodate(page) && err == PAGE_SIZE)
SetPageUptodate(page);
/*
@@ -502,7 +502,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
if (err > 0 && (pos > inode->i_size))
inode->i_size = pos;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -959,10 +959,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
if (S_ISLNK(root_inode->i_mode)) {
char *name = follow_link(host_root_path);
- if (IS_ERR(name))
+ if (IS_ERR(name)) {
err = PTR_ERR(name);
- else
- err = read_name(root_inode, name);
+ goto out_put;
+ }
+ err = read_name(root_inode, name);
kfree(name);
if (err)
goto out_put;
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index fa27980f22292..bb87d65f0d971 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -26,7 +26,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
/*return -ENOENT;*/
x:
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (i = 0; i < l; i++)
hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash);
qstr->hash = end_name_hash(hash);
@@ -34,7 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
return 0;
}
-static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+static int hpfs_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
unsigned al = len;
@@ -50,7 +50,7 @@ static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry
if (hpfs_chk_name(name->name, &bl))
return 1;
- if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
+ if (hpfs_compare_names(dentry->d_sb, str, al, name->name, bl, 0))
return 1;
return 0;
}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index e57a53c13d864..7b9150c2e75c4 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -44,7 +44,11 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
else goto fail;
if (pos == 12) goto fail;
}
- hpfs_add_pos(i, &filp->f_pos);
+ if (unlikely(hpfs_add_pos(i, &filp->f_pos) < 0)) {
+ hpfs_unlock(s);
+ inode_unlock(i);
+ return -ENOMEM;
+ }
ok:
filp->f_pos = new_off;
hpfs_unlock(s);
@@ -141,8 +145,10 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = 1;
}
if (ctx->pos == 1) {
+ ret = hpfs_add_pos(inode, &file->f_pos);
+ if (unlikely(ret < 0))
+ goto out;
ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
- hpfs_add_pos(inode, &file->f_pos);
file->f_version = inode->i_version;
}
next_pos = ctx->pos;
@@ -324,7 +330,7 @@ const struct file_operations hpfs_dir_ops =
{
.llseek = hpfs_dir_lseek,
.read = generic_read_dir,
- .iterate = hpfs_readdir,
+ .iterate_shared = hpfs_readdir,
.release = hpfs_dir_release,
.fsync = hpfs_file_fsync,
.unlocked_ioctl = hpfs_ioctl,
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 2923a7bd82acc..86ab7e790b4e5 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -21,7 +21,7 @@ static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde)
return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1;
}
-void hpfs_add_pos(struct inode *inode, loff_t *pos)
+int hpfs_add_pos(struct inode *inode, loff_t *pos)
{
struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
int i = 0;
@@ -29,11 +29,12 @@ void hpfs_add_pos(struct inode *inode, loff_t *pos)
if (hpfs_inode->i_rddir_off)
for (; hpfs_inode->i_rddir_off[i]; i++)
- if (hpfs_inode->i_rddir_off[i] == pos) return;
+ if (hpfs_inode->i_rddir_off[i] == pos)
+ return 0;
if (!(i&0x0f)) {
if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) {
pr_err("out of memory for position list\n");
- return;
+ return -ENOMEM;
}
if (hpfs_inode->i_rddir_off) {
memcpy(ppos, hpfs_inode->i_rddir_off, i * sizeof(loff_t));
@@ -43,6 +44,7 @@ void hpfs_add_pos(struct inode *inode, loff_t *pos)
}
hpfs_inode->i_rddir_off[i] = pos;
hpfs_inode->i_rddir_off[i + 1] = NULL;
+ return 0;
}
void hpfs_del_pos(struct inode *inode, loff_t *pos)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 975654a63c13d..aebb78f9e47f2 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -242,7 +242,7 @@ extern const struct file_operations hpfs_dir_ops;
/* dnode.c */
-void hpfs_add_pos(struct inode *, loff_t *);
+int hpfs_add_pos(struct inode *, loff_t *);
void hpfs_del_pos(struct inode *, loff_t *);
struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
const unsigned char *, unsigned, secno);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 458cf463047b6..82067ca22f2b9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
+#include <linux/seq_file.h>
/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
@@ -453,10 +454,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
int lowercase, eas, chk, errs, chkdsk, timeshift;
int o;
struct hpfs_sb_info *sbi = hpfs_sb(s);
- char *new_opts = kstrdup(data, GFP_KERNEL);
-
- if (!new_opts)
- return -ENOMEM;
sync_filesystem(s);
@@ -493,17 +490,44 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
if (!(*flags & MS_RDONLY)) mark_dirty(s, 1);
- replace_mount_options(s, new_opts);
-
hpfs_unlock(s);
return 0;
out_err:
hpfs_unlock(s);
- kfree(new_opts);
return -EINVAL;
}
+/*
+ * Print the mount options currently in effect for an HPFS superblock; this is
+ * the function installed as .show_options in hpfs_sops.
+ *
+ * uid, gid and umask are always printed.  Every other option is printed only
+ * for the specific values tested below.  Each option carries its own leading
+ * comma.  Always returns 0.
+ */
+static int hpfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb);
+
+	seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid));
+	seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid));
+	seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777));
+
+	/* Constant strings take no arguments, so seq_puts() is sufficient. */
+	if (sbi->sb_lowercase)
+		seq_puts(seq, ",case=lower");
+	if (!sbi->sb_chk)
+		seq_puts(seq, ",check=none");
+	if (sbi->sb_chk == 2)
+		seq_puts(seq, ",check=strict");
+	if (!sbi->sb_err)
+		seq_puts(seq, ",errors=continue");
+	if (sbi->sb_err == 2)
+		seq_puts(seq, ",errors=panic");
+	if (!sbi->sb_chkdsk)
+		seq_puts(seq, ",chkdsk=no");
+	if (sbi->sb_chkdsk == 2)
+		seq_puts(seq, ",chkdsk=always");
+	if (!sbi->sb_eas)
+		seq_puts(seq, ",eas=no");
+	if (sbi->sb_eas == 1)
+		seq_puts(seq, ",eas=ro");
+	if (sbi->sb_timeshift)
+		seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift);
+	return 0;
+}
+
/* Super operations */
static const struct super_operations hpfs_sops =
@@ -514,7 +538,7 @@ static const struct super_operations hpfs_sops =
.put_super = hpfs_put_super,
.statfs = hpfs_statfs,
.remount_fs = hpfs_remount_fs,
- .show_options = generic_show_options,
+ .show_options = hpfs_show_options,
};
static int hpfs_fill_super(struct super_block *s, void *options, int silent)
@@ -537,8 +561,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
int o;
- save_mount_options(s, options);
-
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi) {
return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e1f465a389d5b..4ea71eba40a57 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -213,12 +213,12 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
int i, chunksize;
/* Find which 4k chunk and offset with in that chunk */
- i = offset >> PAGE_CACHE_SHIFT;
- offset = offset & ~PAGE_CACHE_MASK;
+ i = offset >> PAGE_SHIFT;
+ offset = offset & ~PAGE_MASK;
while (size) {
size_t n;
- chunksize = PAGE_CACHE_SIZE;
+ chunksize = PAGE_SIZE;
if (offset)
chunksize -= offset;
if (chunksize > size)
@@ -237,7 +237,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
/*
* Support for read() - Find the page attached to f_mapping and copy out the
* data. Its *very* similar to do_generic_mapping_read(), we can't use that
- * since it has PAGE_CACHE_SIZE assumptions.
+ * since it has PAGE_SIZE assumptions.
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -285,7 +285,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
* We have the page, copy it to user space buffer.
*/
copied = hugetlbfs_read_actor(page, offset, to, nr);
- page_cache_release(page);
+ put_page(page);
}
offset += copied;
retval += copied;
diff --git a/fs/inode.c b/fs/inode.c
index 69b8b526c1946..7e3ef3af3db9e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -151,6 +151,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_bdev = NULL;
inode->i_cdev = NULL;
inode->i_link = NULL;
+ inode->i_dir_seq = 0;
inode->i_rdev = 0;
inode->dirtied_when = 0;
@@ -165,8 +166,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
- mutex_init(&inode->i_mutex);
- lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+ init_rwsem(&inode->i_rwsem);
+ lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
atomic_set(&inode->i_dio_count, 0);
@@ -238,9 +239,9 @@ void __destroy_inode(struct inode *inode)
}
#ifdef CONFIG_FS_POSIX_ACL
- if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
+ if (inode->i_acl && !is_uncached_acl(inode->i_acl))
posix_acl_release(inode->i_acl);
- if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
+ if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
posix_acl_release(inode->i_default_acl);
#endif
this_cpu_dec(nr_inodes);
@@ -344,7 +345,7 @@ EXPORT_SYMBOL(inc_nlink);
void address_space_init_once(struct address_space *mapping)
{
memset(mapping, 0, sizeof(*mapping));
- INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+ INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&mapping->tree_lock);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
@@ -364,6 +365,7 @@ void inode_init_once(struct inode *inode)
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
INIT_LIST_HEAD(&inode->i_io_list);
+ INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
@@ -506,6 +508,7 @@ void clear_inode(struct inode *inode)
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
+ BUG_ON(!list_empty(&inode->i_wb_list));
/* don't need i_lock here, no concurrent mods to i_state */
inode->i_state = I_FREEING | I_CLEAR;
}
@@ -924,13 +927,13 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
struct file_system_type *type = inode->i_sb->s_type;
/* Set new key only if filesystem hasn't already changed it */
- if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {
+ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
/*
* ensure nobody is actually holding i_mutex
*/
- mutex_destroy(&inode->i_mutex);
- mutex_init(&inode->i_mutex);
- lockdep_set_class(&inode->i_mutex,
+ // mutex_destroy(&inode->i_mutex);
+ init_rwsem(&inode->i_rwsem);
+ lockdep_set_class(&inode->i_rwsem,
&type->i_mutex_dir_key);
}
}
@@ -1616,6 +1619,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
if (inode->i_flags & S_NOATIME)
return false;
+
+ /* Atime updates will likely cause i_uid and i_gid to be written
+ * back improperly if their true value is unknown to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return false;
+
if (IS_NOATIME(inode))
return false;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
@@ -1719,7 +1729,6 @@ int dentry_needs_remove_privs(struct dentry *dentry)
mask |= ATTR_KILL_PRIV;
return mask;
}
-EXPORT_SYMBOL(dentry_needs_remove_privs);
static int __remove_privs(struct dentry *dentry, int kill)
{
@@ -1739,8 +1748,8 @@ static int __remove_privs(struct dentry *dentry, int kill)
*/
int file_remove_privs(struct file *file)
{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = d_inode(dentry);
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = file_inode(file);
int kill;
int error = 0;
@@ -1748,7 +1757,7 @@ int file_remove_privs(struct file *file)
if (IS_NOSEC(inode))
return 0;
- kill = file_needs_remove_privs(file);
+ kill = dentry_needs_remove_privs(dentry);
if (kill < 0)
return kill;
if (kill)
diff --git a/fs/internal.h b/fs/internal.h
index c8ca0c957743b..ba0737649d4a2 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
struct super_block;
struct file_system_type;
+struct iomap;
struct linux_binprm;
struct path;
struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
* buffer.c
*/
extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap);
/*
* char_dev.c
@@ -115,6 +118,7 @@ extern struct file *filp_clone_open(struct file *);
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern void inode_add_lru(struct inode *inode);
+extern int dentry_needs_remove_privs(struct dentry *dentry);
/*
* fs-writeback.c
@@ -131,6 +135,7 @@ extern int invalidate_inodes(struct super_block *, bool);
extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
extern int d_set_mounted(struct dentry *dentry);
extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
+extern struct dentry *d_alloc_cursor(struct dentry *);
/*
* read_write.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 116a333e9c773..0f56deb24ce65 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -590,6 +590,7 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
goto out;
}
+ same->dest_count = count;
ret = vfs_dedupe_file_range(file, same);
if (ret)
goto out;
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 0000000000000..48141b8eff5f4
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/dax.h>
+#include "internal.h"
+
+/*
+ * Callback run by iomap_apply() on the range covered by a single mapping.
+ * iomap_apply() hands a non-zero result from the callback back to its own
+ * caller, and passes positive results to ->iomap_end as the amount done.
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+ void *data, struct iomap *iomap);
+
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ *
+ * Returns the actor's result when it is non-zero, otherwise the result of
+ * ->iomap_end.  An error from ->iomap_begin is returned as is.  A mapping
+ * that starts after @pos, or that has zero length, yields -EIO without
+ * calling the actor or ->iomap_end.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+		struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+	struct iomap iomap = { 0 };
+	loff_t written = 0, ret;
+
+	/*
+	 * Need to map a range from start position for length bytes. This can
+	 * span multiple pages - it is only guaranteed to return a range of a
+	 * single type of pages (e.g. all into a hole, all mapped or all
+	 * unwritten). Failure at this point has nothing to undo.
+	 *
+	 * If allocation is required for this range, reserve the space now so
+	 * that the allocation is guaranteed to succeed later on. Once we copy
+	 * the data into the page cache pages, then we cannot fail otherwise we
+	 * expose transient stale data. If the reserve fails, we can safely
+	 * back out at this point as there is nothing to undo.
+	 */
+	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+	if (ret)
+		return ret;
+	if (WARN_ON(iomap.offset > pos))
+		return -EIO;
+	/*
+	 * A zero-length mapping would trim the request below to nothing (or
+	 * to a negative length), so the actor could never make progress.
+	 */
+	if (WARN_ON(iomap.length == 0))
+		return -EIO;
+
+	/*
+	 * Cut down the length to the one actually provided by the filesystem,
+	 * as it might not be able to give us the whole size that we requested.
+	 */
+	if (iomap.offset + iomap.length < pos + length)
+		length = iomap.offset + iomap.length - pos;
+
+	/*
+	 * Now that we have guaranteed that the space allocation will succeed,
+	 * we can do the copy-in page by page without having to worry about
+	 * failures exposing transient data.
+	 */
+	written = actor(inode, pos, length, data, &iomap);
+
+	/*
+	 * Now the data has been copied, commit the range we've copied. This
+	 * should not fail unless the filesystem has had a fatal error.
+	 */
+	ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+			flags, &iomap);
+
+	return written ? written : ret;
+}
+
+/*
+ * Clean up the page cache after a failed or short write of @len bytes at @pos.
+ *
+ * Only the part of the range that lies beyond the current i_size is handed to
+ * truncate_pagecache_range(), even if the write started inside the existing
+ * inode size.  Nothing is done when the range ends at or before i_size.
+ */
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+	loff_t isize = i_size_read(inode);
+	loff_t end = pos + len;
+
+	if (end <= isize)
+		return;
+
+	truncate_pagecache_range(inode, max(pos, isize), end);
+}
+
+/*
+ * Grab the page cache page covering @pos and prepare @len bytes of it for
+ * writing through __block_write_begin_int() using the supplied mapping.
+ *
+ * The range must lie within @iomap; this is enforced with BUG_ON().
+ *
+ * Returns 0 with the page stored in *@pagep.  Returns -ENOMEM if no page
+ * could be grabbed, in which case *@pagep is left untouched.  If
+ * __block_write_begin_int() fails, its error is returned after the page has
+ * been unlocked and released, iomap_write_failed() has been called for the
+ * range, and *@pagep has been set to NULL.
+ */
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, struct iomap *iomap)
+{
+	struct page *pg;
+	int err;
+
+	BUG_ON(pos + len > iomap->offset + iomap->length);
+
+	pg = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
+			flags);
+	if (!pg)
+		return -ENOMEM;
+
+	err = __block_write_begin_int(pg, pos, len, NULL, iomap);
+	if (unlikely(err)) {
+		unlock_page(pg);
+		put_page(pg);
+		iomap_write_failed(inode, pos, len);
+		*pagep = NULL;
+		return err;
+	}
+
+	*pagep = pg;
+	return 0;
+}
+
+/*
+ * Complete a write of @len bytes at @pos, of which @copied bytes were placed
+ * in @page, by calling generic_write_end().
+ *
+ * When generic_write_end() reports less than @len, iomap_write_failed() is
+ * called for the whole range.  Returns generic_write_end()'s result.
+ */
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+		unsigned copied, struct page *page)
+{
+	int done = generic_write_end(NULL, inode->i_mapping, pos, len,
+			copied, page, NULL);
+
+	if (done < len)
+		iomap_write_failed(inode, pos, len);
+
+	return done;
+}
+
+/*
+ * iomap_apply() actor for buffered writes: copy data from the iov_iter passed
+ * in @data into the page cache, at most one page per iteration, for up to
+ * @length bytes starting at @pos.
+ *
+ * Returns the number of bytes written if any were written, otherwise the last
+ * status (0 or a negative errno).
+ */
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct iov_iter *i = data;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = AOP_FLAG_NOFS;
+
+ /*
+ * Copies from kernel address space cannot fail (NFSD is a big user).
+ */
+ if (!iter_is_iovec(i))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ do {
+ struct page *page;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_count(i));
+again:
+ /* Never go past the end of the range covered by the mapping. */
+ if (bytes > length)
+ bytes = length;
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ status = iomap_write_begin(inode, pos, bytes, flags, &page,
+ iomap);
+ if (unlikely(status))
+ break;
+
+ if (mapping_writably_mapped(inode->i_mapping))
+ flush_dcache_page(page);
+
+ /* The copy itself runs with page faults disabled. */
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
+
+ flush_dcache_page(page);
+ mark_page_accessed(page);
+
+ status = iomap_write_end(inode, pos, bytes, copied, page);
+ if (unlikely(status < 0))
+ break;
+ /* Continue with the byte count reported by iomap_write_end(). */
+ copied = status;
+
+ cond_resched();
+
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+ length -= copied;
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ } while (iov_iter_count(i) && length);
+
+ return written ? written : status;
+}
+
+/*
+ * Buffered write entry point built on iomap_apply().
+ *
+ * Starting at iocb->ki_pos, iomap_apply() is called with iomap_write_actor
+ * until @iter is drained or a call returns a value <= 0.  iocb->ki_pos itself
+ * is not modified here.
+ *
+ * Returns the total number of bytes written if that is non-zero, otherwise
+ * the result of the last iomap_apply() call (0 if @iter was empty on entry).
+ */
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	loff_t pos = iocb->ki_pos;
+	loff_t total = 0;
+	loff_t ret = 0;
+
+	for (; iov_iter_count(iter); pos += ret, total += ret) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter),
+				IOMAP_WRITE, ops, iter, iomap_write_actor);
+		if (ret <= 0)
+			break;
+	}
+
+	if (total)
+		return total;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
+/*
+ * Zero @bytes bytes at @offset within the page cache page covering @pos.
+ *
+ * The page is obtained with iomap_write_begin() (uninterruptible, NOFS),
+ * zeroed with zero_user() and completed with iomap_write_end().  Returns the
+ * error from iomap_write_begin() if it fails, otherwise the result of
+ * iomap_write_end().
+ */
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+		unsigned bytes, struct iomap *iomap)
+{
+	const unsigned aop_flags = AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS;
+	struct page *pg;
+	int err;
+
+	err = iomap_write_begin(inode, pos, bytes, aop_flags, &pg, iomap);
+	if (err)
+		return err;
+
+	zero_user(pg, offset, bytes);
+	mark_page_accessed(pg);
+
+	return iomap_write_end(inode, pos, bytes, bytes, pg);
+}
+
+static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
+ struct iomap *iomap)
+{
+ sector_t sector = iomap->blkno +
+ (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
+
+ return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+}
+
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+ void *data, struct iomap *iomap)
+{
+ bool *did_zero = data;
+ loff_t written = 0;
+ int status;
+
+ /* already zeroed? we're done. */
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ return count;
+
+ do {
+ unsigned offset, bytes;
+
+ offset = pos & (PAGE_SIZE - 1); /* Within page */
+ bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+
+ if (IS_DAX(inode))
+ status = iomap_dax_zero(pos, offset, bytes, iomap);
+ else
+ status = iomap_zero(inode, pos, offset, bytes, iomap);
+ if (status < 0)
+ return status;
+
+ pos += bytes;
+ count -= bytes;
+ written += bytes;
+ if (did_zero)
+ *did_zero = true;
+ } while (count > 0);
+
+ return written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ loff_t ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
+ ops, did_zero, iomap_zero_range_actor);
+ if (ret <= 0)
+ return ret;
+
+ pos += ret;
+ len -= ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ unsigned blocksize = (1 << inode->i_blkbits);
+ unsigned off = pos & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!off)
+ return 0;
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
+
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ struct page *page = data;
+ int ret;
+
+ ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+ NULL, iomap);
+ if (ret)
+ return ret;
+
+ block_commit_write(page, 0, length);
+ return length;
+}
+
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ unsigned long length;
+ loff_t offset, size;
+ ssize_t ret;
+
+ lock_page(page);
+ size = i_size_read(inode);
+ if ((page->mapping != inode->i_mapping) ||
+ (page_offset(page) > size)) {
+ /* We overload EFAULT to mean page got truncated */
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* page is wholly or partially inside EOF */
+ if (((page->index + 1) << PAGE_SHIFT) > size)
+ length = size & ~PAGE_MASK;
+ else
+ length = PAGE_SIZE;
+
+ offset = page_offset(page);
+ while (length > 0) {
+ ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+ ops, page, iomap_page_mkwrite_actor);
+ if (unlikely(ret <= 0))
+ goto out_unlock;
+ offset += ret;
+ length -= ret;
+ }
+
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+ return 0;
+out_unlock:
+ unlock_page(page);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+struct fiemap_ctx {
+ struct fiemap_extent_info *fi;
+ struct iomap prev;
+};
+
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+ struct iomap *iomap, u32 flags)
+{
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ /* skip holes */
+ return 0;
+ case IOMAP_DELALLOC:
+ flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+ break;
+ case IOMAP_UNWRITTEN:
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ break;
+ case IOMAP_MAPPED:
+ break;
+ }
+
+ return fiemap_fill_next_extent(fi, iomap->offset,
+ iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
+ iomap->length, flags | FIEMAP_EXTENT_MERGED);
+
+}
+
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct fiemap_ctx *ctx = data;
+ loff_t ret = length;
+
+ if (iomap->type == IOMAP_HOLE)
+ return length;
+
+ ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+ ctx->prev = *iomap;
+ switch (ret) {
+ case 0: /* success */
+ return length;
+ case 1: /* extent array full */
+ return 0;
+ default:
+ return ret;
+ }
+}
+
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+ loff_t start, loff_t len, struct iomap_ops *ops)
+{
+ struct fiemap_ctx ctx;
+ loff_t ret;
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.fi = fi;
+ ctx.prev.type = IOMAP_HOLE;
+
+ ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+ iomap_fiemap_actor);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ break;
+
+ start += ret;
+ len -= ret;
+ }
+
+ if (ctx.prev.type != IOMAP_HOLE) {
+ ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index f311bf084015f..44af14b2e9166 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -26,7 +26,7 @@
#include "zisofs.h"
/* This should probably be global. */
-static char zisofs_sink_page[PAGE_CACHE_SIZE];
+static char zisofs_sink_page[PAGE_SIZE];
/*
* This contains the zlib memory allocation and the mutex for the
@@ -70,18 +70,18 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
for ( i = 0 ; i < pcount ; i++ ) {
if (!pages[i])
continue;
- memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+ memset(page_address(pages[i]), 0, PAGE_SIZE);
flush_dcache_page(pages[i]);
SetPageUptodate(pages[i]);
}
- return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
+ return ((loff_t)pcount) << PAGE_SHIFT;
}
/* Because zlib is not thread-safe, do all the I/O at the top. */
blocknum = block_start >> bufshift;
memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
- ll_rw_block(READ, haveblocks, bhs);
+ ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs);
curbh = 0;
curpage = 0;
@@ -121,11 +121,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
if (pages[curpage]) {
stream.next_out = page_address(pages[curpage])
+ poffset;
- stream.avail_out = PAGE_CACHE_SIZE - poffset;
+ stream.avail_out = PAGE_SIZE - poffset;
poffset = 0;
} else {
stream.next_out = (void *)&zisofs_sink_page;
- stream.avail_out = PAGE_CACHE_SIZE;
+ stream.avail_out = PAGE_SIZE;
}
}
if (!stream.avail_in) {
@@ -220,14 +220,14 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
* pages with the data we have anyway...
*/
start_off = page_offset(pages[full_page]);
- end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+ end_off = min_t(loff_t, start_off + PAGE_SIZE, inode->i_size);
cstart_block = start_off >> zisofs_block_shift;
cend_block = (end_off + (1 << zisofs_block_shift) - 1)
>> zisofs_block_shift;
- WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
- ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+ WARN_ON(start_off - (full_page << PAGE_SHIFT) !=
+ ((cstart_block << zisofs_block_shift) & PAGE_MASK));
/* Find the pointer to this specific chunk */
/* Note: we're not using isonum_731() here because the data is known aligned */
@@ -260,10 +260,10 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
ret = zisofs_uncompress_block(inode, block_start, block_end,
pcount, pages, poffset, &err);
poffset += ret;
- pages += poffset >> PAGE_CACHE_SHIFT;
- pcount -= poffset >> PAGE_CACHE_SHIFT;
- full_page -= poffset >> PAGE_CACHE_SHIFT;
- poffset &= ~PAGE_CACHE_MASK;
+ pages += poffset >> PAGE_SHIFT;
+ pcount -= poffset >> PAGE_SHIFT;
+ full_page -= poffset >> PAGE_SHIFT;
+ poffset &= ~PAGE_MASK;
if (err) {
brelse(bh);
@@ -282,7 +282,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
if (poffset && *pages) {
memset(page_address(*pages) + poffset, 0,
- PAGE_CACHE_SIZE - poffset);
+ PAGE_SIZE - poffset);
flush_dcache_page(*pages);
SetPageUptodate(*pages);
}
@@ -302,12 +302,12 @@ static int zisofs_readpage(struct file *file, struct page *page)
int i, pcount, full_page;
unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
unsigned int zisofs_pages_per_cblock =
- PAGE_CACHE_SHIFT <= zisofs_block_shift ?
- (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+ PAGE_SHIFT <= zisofs_block_shift ?
+ (1 << (zisofs_block_shift - PAGE_SHIFT)) : 0;
struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
pgoff_t index = page->index, end_index;
- end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
/*
* If this page is wholly outside i_size we just return zero;
* do_generic_file_read() will handle this for us
@@ -318,7 +318,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
return 0;
}
- if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+ if (PAGE_SHIFT <= zisofs_block_shift) {
/* We have already been given one page, this is the one
we must do. */
full_page = index & (zisofs_pages_per_cblock - 1);
@@ -351,7 +351,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
kunmap(pages[i]);
unlock_page(pages[i]);
if (i != full_page)
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
}
@@ -361,7 +361,6 @@ static int zisofs_readpage(struct file *file, struct page *page)
const struct address_space_operations zisofs_aops = {
.readpage = zisofs_readpage,
- /* No sync_page operation supported? */
/* No bmap operation supported */
};
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b943cbd963bb9..e7599615e4e04 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -58,7 +58,7 @@ int get_acorn_filename(struct iso_directory_record *de,
std = sizeof(struct iso_directory_record) + de->name_len[0];
if (std & 1)
std++;
- if ((*((unsigned char *) de) - std) != 32)
+ if (de->length[0] - std != 32)
return retnamlen;
chr = ((unsigned char *) de) + std;
if (strncmp(chr, "ARCHIMEDES", 10))
@@ -269,7 +269,7 @@ const struct file_operations isofs_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = isofs_readdir,
+ .iterate_shared = isofs_readdir,
};
/*
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bcd2d41b318a4..ad0c745ebad72 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -29,18 +29,15 @@
#define BEQUIET
static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
-static int isofs_dentry_cmpi(const struct dentry *parent,
- const struct dentry *dentry,
+static int isofs_dentry_cmpi(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
#ifdef CONFIG_JOLIET
static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
-static int isofs_dentry_cmpi_ms(const struct dentry *parent,
- const struct dentry *dentry,
+static int isofs_dentry_cmpi_ms(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
-static int isofs_dentry_cmp_ms(const struct dentry *parent,
- const struct dentry *dentry,
+static int isofs_dentry_cmp_ms(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
#endif
@@ -174,7 +171,7 @@ struct iso9660_options{
* Compute the hash for the isofs name corresponding to the dentry.
*/
static int
-isofs_hashi_common(struct qstr *qstr, int ms)
+isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
{
const char *name;
int len;
@@ -188,7 +185,7 @@ isofs_hashi_common(struct qstr *qstr, int ms)
len--;
}
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
while (len--) {
c = tolower(*name++);
hash = partial_name_hash(c, hash);
@@ -231,11 +228,11 @@ static int isofs_dentry_cmp_common(
static int
isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hashi_common(qstr, 0);
+ return isofs_hashi_common(dentry, qstr, 0);
}
static int
-isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
+isofs_dentry_cmpi(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -246,7 +243,7 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
* Compute the hash for the isofs name corresponding to the dentry.
*/
static int
-isofs_hash_common(struct qstr *qstr, int ms)
+isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
{
const char *name;
int len;
@@ -258,7 +255,7 @@ isofs_hash_common(struct qstr *qstr, int ms)
len--;
}
- qstr->hash = full_name_hash(name, len);
+ qstr->hash = full_name_hash(dentry, name, len);
return 0;
}
@@ -266,24 +263,24 @@ isofs_hash_common(struct qstr *qstr, int ms)
static int
isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hash_common(qstr, 1);
+ return isofs_hash_common(dentry, qstr, 1);
}
static int
isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hashi_common(qstr, 1);
+ return isofs_hashi_common(dentry, qstr, 1);
}
static int
-isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
+isofs_dentry_cmp_ms(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return isofs_dentry_cmp_common(len, str, name, 1, 0);
}
static int
-isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
+isofs_dentry_cmpi_ms(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return isofs_dentry_cmp_common(len, str, name, 1, 1);
@@ -1021,7 +1018,7 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock,
* the page with useless information without generating any
* I/O errors.
*/
- if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
+ if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
__func__, b_off,
(unsigned long long)inode->i_size);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 7b543e6b6526d..aee592767f1d0 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -22,7 +22,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
qstr.len = dlen;
if (likely(!dentry->d_op))
return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen);
- return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
+ return dentry->d_op->d_compare(NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
}
/*
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 5384ceb35b1cc..98b3eb7d8eaf6 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -203,6 +203,8 @@ int get_rock_ridge_filename(struct iso_directory_record *de,
int retnamlen = 0;
int truncate = 0;
int ret = 0;
+ char *p;
+ int len;
if (!ISOFS_SB(inode->i_sb)->s_rock)
return 0;
@@ -267,12 +269,17 @@ repeat:
rr->u.NM.flags);
break;
}
- if ((strlen(retname) + rr->len - 5) >= 254) {
+ len = rr->len - 5;
+ if (retnamlen + len >= 254) {
truncate = 1;
break;
}
- strncat(retname, rr->u.NM.name, rr->len - 5);
- retnamlen += rr->len - 5;
+ p = memchr(rr->u.NM.name, '\0', len);
+ if (unlikely(p))
+ len = p - rr->u.NM.name;
+ memcpy(retname + retnamlen, rr->u.NM.name, len);
+ retnamlen += len;
+ retname[retnamlen] = '\0';
break;
case SIG('R', 'E'):
kfree(rs.buffer);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 517f2de784cfc..5bb565f9989cc 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -81,11 +81,11 @@ static void release_buffer_page(struct buffer_head *bh)
if (!trylock_page(page))
goto nope;
- page_cache_get(page);
+ get_page(page);
__brelse(bh);
try_to_free_buffers(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return;
nope:
@@ -124,7 +124,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct commit_header *tmp;
struct buffer_head *bh;
int ret;
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
*cbh = NULL;
@@ -155,9 +155,9 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
- ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
+ ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh);
else
- ret = submit_bh(WRITE_SYNC, bh);
+ ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
*cbh = bh;
return ret;
@@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal,
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ if (!(jinode->i_flags & JI_WRITE_DATA))
+ continue;
mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
@@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ if (!(jinode->i_flags & JI_WAIT_DATA))
+ continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
@@ -714,7 +718,7 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(WRITE_SYNC, bh);
+ submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
}
cond_resched();
stats.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index de73a9516a542..46261a6f902dd 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_inode_add_write);
+EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
@@ -690,6 +691,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
int err = 0;
+ jbd2_might_wait_for_commit(journal);
read_lock(&journal->j_state_lock);
#ifdef CONFIG_JBD2_DEBUG
if (!tid_geq(journal->j_commit_request, tid)) {
@@ -1090,6 +1092,7 @@ static void jbd2_stats_proc_exit(journal_t *journal)
static journal_t * journal_init_common (void)
{
+ static struct lock_class_key jbd2_trans_commit_key;
journal_t *journal;
int err;
@@ -1125,6 +1128,9 @@ static journal_t * journal_init_common (void)
spin_lock_init(&journal->j_history_lock);
+ lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
+ &jbd2_trans_commit_key, 0);
+
return journal;
}
@@ -1345,15 +1351,15 @@ static int journal_reset(journal_t *journal)
return jbd2_journal_start_thread(journal);
}
-static int jbd2_write_superblock(journal_t *journal, int write_op)
+static int jbd2_write_superblock(journal_t *journal, int write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
journal_superblock_t *sb = journal->j_superblock;
int ret;
- trace_jbd2_write_superblock(journal, write_op);
+ trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
- write_op &= ~(REQ_FUA | REQ_FLUSH);
+ write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
lock_buffer(bh);
if (buffer_write_io_error(bh)) {
/*
@@ -1373,7 +1379,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_op)
jbd2_superblock_csum_set(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(write_op, bh);
+ ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
wait_on_buffer(bh);
if (buffer_write_io_error(bh)) {
clear_buffer_write_io_error(bh);
@@ -1497,7 +1503,7 @@ static int journal_get_superblock(journal_t *journal)
J_ASSERT(bh != NULL);
if (!buffer_uptodate(bh)) {
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
printk(KERN_ERR
@@ -2221,7 +2227,7 @@ void jbd2_journal_ack_err(journal_t *journal)
int jbd2_journal_blocks_per_page(struct inode *inode)
{
- return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
}
/*
@@ -2328,18 +2334,10 @@ void *jbd2_alloc(size_t size, gfp_t flags)
BUG_ON(size & (size-1)); /* Must be a power of 2 */
- flags |= __GFP_REPEAT;
- if (size == PAGE_SIZE)
- ptr = (void *)__get_free_pages(flags, 0);
- else if (size > PAGE_SIZE) {
- int order = get_order(size);
-
- if (order < 3)
- ptr = (void *)__get_free_pages(flags, order);
- else
- ptr = vmalloc(size);
- } else
+ if (size < PAGE_SIZE)
ptr = kmem_cache_alloc(get_slab(size), flags);
+ else
+ ptr = (void *)__get_free_pages(flags, get_order(size));
/* Check alignment; SLUB has gotten this wrong in the past,
* and this can lead to user data corruption! */
@@ -2350,20 +2348,10 @@ void *jbd2_alloc(size_t size, gfp_t flags)
void jbd2_free(void *ptr, size_t size)
{
- if (size == PAGE_SIZE) {
- free_pages((unsigned long)ptr, 0);
- return;
- }
- if (size > PAGE_SIZE) {
- int order = get_order(size);
-
- if (order < 3)
- free_pages((unsigned long)ptr, order);
- else
- vfree(ptr);
- return;
- }
- kmem_cache_free(get_slab(size), ptr);
+ if (size < PAGE_SIZE)
+ kmem_cache_free(get_slab(size), ptr);
+ else
+ free_pages((unsigned long)ptr, get_order(size));
};
/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 08a456b96e4ef..02dd3360cb20c 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -104,7 +104,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
- ll_rw_block(READ, nbufs, bufs);
+ ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
@@ -113,7 +113,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
if (nbufs)
- ll_rw_block(READ, nbufs, bufs);
+ ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
err = 0;
failed:
@@ -303,7 +303,7 @@ int jbd2_journal_recover(journal_t *journal)
* Locate any valid recovery information from the journal and set up the
* journal structures in memory to ignore it (presumably because the
* caller has evidence that it is out of date).
- * This function does'nt appear to be exorted..
+ * This function doesn't appear to be exported..
*
* We perform one pass over the journal to allow us to tell the user how
* much recovery information is being erased, and to let us initialise
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 01e4652d88f69..b5bc3e2491632 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -182,6 +182,8 @@ static int add_transaction_credits(journal_t *journal, int blocks,
int needed;
int total = blocks + rsv_blocks;
+ jbd2_might_wait_for_commit(journal);
+
/*
* If the current transaction is locked down for commit, wait
* for the lock to be released.
@@ -382,13 +384,11 @@ repeat:
read_unlock(&journal->j_state_lock);
current->journal_info = handle;
- lock_map_acquire(&handle->h_lockdep_map);
+ rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
jbd2_journal_free_transaction(new_transaction);
return 0;
}
-static struct lock_class_key jbd2_handle_key;
-
/* Allocate a new handle. This should probably be in a slab... */
static handle_t *new_handle(int nblocks)
{
@@ -398,9 +398,6 @@ static handle_t *new_handle(int nblocks)
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
- lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
- &jbd2_handle_key, 0);
-
return handle;
}
@@ -543,7 +540,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
*
* Some transactions, such as large extends and truncates, can be done
* atomically all at once or in several stages. The operation requests
- * a credit for a number of buffer modications in advance, but can
+ * a credit for a number of buffer modifications in advance, but can
* extend its credit if it needs more.
*
* jbd2_journal_extend tries to give the running handle more buffer credits.
@@ -627,7 +624,7 @@ error_out:
* If the jbd2_journal_extend() call above fails to grant new buffer credits
* to a running handle, a call to jbd2_journal_restart will commit the
* handle's transaction so far and reattach the handle to a new
- * transaction capabable of guaranteeing the requested number of
+ * transaction capable of guaranteeing the requested number of
* credits. We preserve reserved handle if there's any attached to the
* passed in handle.
*/
@@ -672,7 +669,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
if (need_to_start)
jbd2_log_start_commit(journal, tid);
- lock_map_release(&handle->h_lockdep_map);
+ rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle, gfp_mask);
return ret;
@@ -700,6 +697,8 @@ void jbd2_journal_lock_updates(journal_t *journal)
{
DEFINE_WAIT(wait);
+ jbd2_might_wait_for_commit(journal);
+
write_lock(&journal->j_state_lock);
++journal->j_barrier_count;
@@ -1586,7 +1585,7 @@ drop:
/**
* int jbd2_journal_stop() - complete a transaction
- * @handle: tranaction to complete.
+ * @handle: transaction to complete.
*
* All done for a particular handle.
*
@@ -1750,11 +1749,11 @@ int jbd2_journal_stop(handle_t *handle)
wake_up(&journal->j_wait_transaction_locked);
}
+ rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+
if (wait_for_commit)
err = jbd2_log_wait_commit(journal, tid);
- lock_map_release(&handle->h_lockdep_map);
-
if (handle->h_rsv_handle)
jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
@@ -2263,7 +2262,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
struct buffer_head *head, *bh, *next;
unsigned int stop = offset + length;
unsigned int curr_off = 0;
- int partial_page = (offset || length < PAGE_CACHE_SIZE);
+ int partial_page = (offset || length < PAGE_SIZE);
int may_free = 1;
int ret = 0;
@@ -2272,7 +2271,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return 0;
- BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+ BUG_ON(stop > PAGE_SIZE || stop < length);
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
@@ -2462,7 +2461,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
/*
* File inode in the inode list of the handle's transaction
*/
-int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
+ unsigned long flags)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -2487,12 +2487,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
* and if jinode->i_next_transaction == transaction, commit code
* will only file the inode where we want it.
*/
- if (jinode->i_transaction == transaction ||
- jinode->i_next_transaction == transaction)
+ if ((jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction) &&
+ (jinode->i_flags & flags) == flags)
return 0;
spin_lock(&journal->j_list_lock);
-
+ jinode->i_flags |= flags;
+ /* Is inode already attached where we need it? */
if (jinode->i_transaction == transaction ||
jinode->i_next_transaction == transaction)
goto done;
@@ -2523,6 +2525,17 @@ done:
return 0;
}
+int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+{
+ return jbd2_journal_file_inode(handle, jinode,
+ JI_WRITE_DATA | JI_WAIT_DATA);
+}
+
+int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+{
+ return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+}
+
/*
* File truncate and transaction commit interact with each other in a
* non-trivial way. If a transaction writing data block A is
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 2f7a3c0904899..bc2693d562987 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -203,8 +203,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
acl = ERR_PTR(rc);
}
kfree(value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
return acl;
}
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 1090eb64b90d6..9d26b1b9fc014 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -95,15 +95,15 @@ __jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f)
rather than mucking around with actually reading the node
and checking the compression type, which is the real way
to tell a hole node. */
- if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag)
- && frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) {
+ if (frag->ofs & (PAGE_SIZE-1) && frag_prev(frag)
+ && frag_prev(frag)->size < PAGE_SIZE && frag_prev(frag)->node) {
JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n",
ref_offset(fn->raw));
bitched = 1;
}
- if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag)
- && frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) {
+ if ((frag->ofs+frag->size) & (PAGE_SIZE-1) && frag_next(frag)
+ && frag_next(frag)->size < PAGE_SIZE && frag_next(frag)->node) {
JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n",
ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size);
bitched = 1;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 30c4c9ebb693f..30eb33ff81892 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
const struct file_operations jffs2_dir_operations =
{
.read = generic_read_dir,
- .iterate = jffs2_readdir,
+ .iterate_shared=jffs2_readdir,
.unlocked_ioctl=jffs2_ioctl,
.fsync = jffs2_fsync,
.llseek = generic_file_llseek,
@@ -81,6 +81,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
struct jffs2_full_dirent *fd = NULL, *fd_list;
uint32_t ino = 0;
struct inode *inode = NULL;
+ unsigned int nhash;
jffs2_dbg(1, "jffs2_lookup()\n");
@@ -89,11 +90,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
dir_f = JFFS2_INODE_INFO(dir_i);
+ /* The 'nhash' on the fd_list is not the same as the dentry hash */
+ nhash = full_name_hash(NULL, target->d_name.name, target->d_name.len);
+
mutex_lock(&dir_f->sem);
/* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */
- for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) {
- if (fd_list->nhash == target->d_name.hash &&
+ for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= nhash; fd_list = fd_list->next) {
+ if (fd_list->nhash == nhash &&
(!fd || fd_list->version > fd->version) &&
strlen(fd_list->name) == target->d_name.len &&
!strncmp(fd_list->name, target->d_name.name, target->d_name.len)) {
@@ -241,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry)
{
- struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb);
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_sb);
struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry));
struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
int ret;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index cad86bac34530..0e62dec3effce 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -87,14 +87,15 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
int ret;
jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n",
- __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT);
+ __func__, inode->i_ino, pg->index << PAGE_SHIFT);
BUG_ON(!PageLocked(pg));
pg_buf = kmap(pg);
/* FIXME: Can kmap fail? */
- ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE);
+ ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT,
+ PAGE_SIZE);
if (ret) {
ClearPageUptodate(pg);
@@ -137,8 +138,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- uint32_t pageofs = index << PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ uint32_t pageofs = index << PAGE_SHIFT;
int ret = 0;
pg = grab_cache_page_write_begin(mapping, index, flags);
@@ -230,7 +231,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
out_page:
unlock_page(pg);
- page_cache_release(pg);
+ put_page(pg);
return ret;
}
@@ -245,14 +246,14 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
struct jffs2_raw_inode *ri;
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + copied;
unsigned aligned_start = start & ~3;
int ret = 0;
uint32_t writtenlen = 0;
jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
- __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT,
+ __func__, inode->i_ino, pg->index << PAGE_SHIFT,
start, end, pg->flags);
/* We need to avoid deadlock with page_cache_read() in
@@ -261,7 +262,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
to re-lock it. */
BUG_ON(!PageUptodate(pg));
- if (end == PAGE_CACHE_SIZE) {
+ if (end == PAGE_SIZE) {
/* When writing out the end of a page, write out the
_whole_ page. This helps to reduce the number of
nodes in files which have many short writes, like
@@ -275,7 +276,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
jffs2_dbg(1, "%s(): Allocation of raw inode failed\n",
__func__);
unlock_page(pg);
- page_cache_release(pg);
+ put_page(pg);
return -ENOMEM;
}
@@ -292,7 +293,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
kmap(pg);
ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start,
- (pg->index << PAGE_CACHE_SHIFT) + aligned_start,
+ (pg->index << PAGE_SHIFT) + aligned_start,
end - aligned_start, &writtenlen);
kunmap(pg);
@@ -329,6 +330,6 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
jffs2_dbg(1, "%s() returning %d\n",
__func__, writtenlen > 0 ? writtenlen : ret);
unlock_page(pg);
- page_cache_release(pg);
+ put_page(pg);
return writtenlen > 0 ? writtenlen : ret;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index bead25ae8fe4a..ae2ebb26b4468 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -586,8 +586,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
goto out_root;
sb->s_maxbytes = 0xFFFFFFFF;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = JFFS2_SUPER_MAGIC;
if (!(sb->s_flags & MS_RDONLY))
jffs2_start_garbage_collect_thread(c);
@@ -685,7 +685,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
struct inode *inode = OFNI_EDONI_2SFFJ(f);
struct page *pg;
- pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+ pg = read_cache_page(inode->i_mapping, offset >> PAGE_SHIFT,
(void *)jffs2_do_readpage_unlock, inode);
if (IS_ERR(pg))
return (void *)pg;
@@ -701,7 +701,7 @@ void jffs2_gc_release_page(struct jffs2_sb_info *c,
struct page *pg = (void *)*priv;
kunmap(pg);
- page_cache_release(pg);
+ put_page(pg);
}
static int jffs2_flash_setup(struct jffs2_sb_info *c) {
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 7e553f286775f..9ed0f26cf0238 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -552,7 +552,7 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era
goto upnout;
}
/* We found a datanode. Do the GC */
- if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) {
+ if((start >> PAGE_SHIFT) < ((end-1) >> PAGE_SHIFT)) {
/* It crosses a page boundary. Therefore, it must be a hole. */
ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end);
} else {
@@ -1192,8 +1192,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
struct jffs2_node_frag *frag;
uint32_t min, max;
- min = start & ~(PAGE_CACHE_SIZE-1);
- max = min + PAGE_CACHE_SIZE;
+ min = start & ~(PAGE_SIZE-1);
+ max = min + PAGE_SIZE;
frag = jffs2_lookup_node_frag(&f->fragtree, start);
@@ -1351,7 +1351,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset);
datalen = end - offset;
- writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1));
+ writebuf = pg_ptr + (offset & (PAGE_SIZE -1));
comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen);
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 9a5449bc3afb0..b86c78d178c60 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -90,7 +90,7 @@ uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list,
/* If the last fragment starts at the RAM page boundary, it is
* REF_PRISTINE irrespective of its size. */
- if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) {
+ if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) {
dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n",
frag->ofs, frag->ofs + frag->size);
frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE;
@@ -237,7 +237,7 @@ static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *r
If so, both 'this' and the new node get marked REF_NORMAL so
the GC can take a look.
*/
- if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) {
+ if (lastend && (lastend-1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) {
if (this->node)
mark_ref_normal(this->node->raw);
mark_ref_normal(newfrag->node->raw);
@@ -382,7 +382,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
/* If we now share a page with other nodes, mark either previous
or next node REF_NORMAL, as appropriate. */
- if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) {
+ if (newfrag->ofs & (PAGE_SIZE-1)) {
struct jffs2_node_frag *prev = frag_prev(newfrag);
mark_ref_normal(fn->raw);
@@ -391,7 +391,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
mark_ref_normal(prev->node->raw);
}
- if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) {
+ if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE-1)) {
struct jffs2_node_frag *next = frag_next(newfrag);
if (next) {
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index bfebbf13698c0..06a71dbd4833e 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -674,7 +674,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
}
}
- fd->nhash = full_name_hash(fd->name, rd->nsize);
+ fd->nhash = full_name_hash(NULL, fd->name, rd->nsize);
fd->next = NULL;
fd->name[rd->nsize] = '\0';
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 9ad5ba4b299be..90431dd613b8d 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -1100,7 +1100,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
fd->next = NULL;
fd->version = je32_to_cpu(rd->version);
fd->ino = je32_to_cpu(rd->ino);
- fd->nhash = full_name_hash(fd->name, checkedlen);
+ fd->nhash = full_name_hash(NULL, fd->name, checkedlen);
fd->type = rd->type;
jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 7a28facd71750..c2332e30f218a 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -49,18 +49,19 @@ int jffs2_init_security(struct inode *inode, struct inode *dir,
/* ---- XATTR Handler for "security.*" ----------------- */
static int jffs2_security_getxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+ return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY,
name, buffer, size);
}
static int jffs2_security_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+ return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
name, buffer, size, flags);
}
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index bc5385471a6e3..be7c8a6a57480 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -476,7 +476,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
fd->next = NULL;
fd->version = je32_to_cpu(spd->version);
fd->ino = je32_to_cpu(spd->ino);
- fd->nhash = full_name_hash(fd->name, checkedlen);
+ fd->nhash = full_name_hash(NULL, fd->name, checkedlen);
fd->type = spd->type;
jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0a9a114bb9d11..5ef21f4c4c77d 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -147,7 +147,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
JFFS2_DEBUG("Parent of directory ino #%u is #%u\n",
f->inocache->ino, pino);
- return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino));
+ return d_obtain_alias(jffs2_iget(child->d_sb, pino));
}
static const struct export_operations jffs2_export_ops = {
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index b634de4c81013..cda9a361368e8 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -172,8 +172,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
beginning of a page and runs to the end of the file, or if
it's a hole node, mark it REF_PRISTINE, else REF_NORMAL.
*/
- if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) ||
- ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) &&
+ if ((je32_to_cpu(ri->dsize) >= PAGE_SIZE) ||
+ ( ((je32_to_cpu(ri->offset)&(PAGE_SIZE-1))==0) &&
(je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) == je32_to_cpu(ri->isize)))) {
flash_ofs |= REF_PRISTINE;
} else {
@@ -245,7 +245,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
fd->version = je32_to_cpu(rd->version);
fd->ino = je32_to_cpu(rd->ino);
- fd->nhash = full_name_hash(name, namelen);
+ fd->nhash = full_name_hash(NULL, name, namelen);
fd->type = rd->type;
memcpy(fd->name, name, namelen);
fd->name[namelen]=0;
@@ -366,7 +366,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
break;
}
mutex_lock(&f->sem);
- datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1)));
+ datalen = min_t(uint32_t, writelen,
+ PAGE_SIZE - (offset & (PAGE_SIZE-1)));
cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen);
comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen);
@@ -597,7 +598,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
jffs2_add_fd_to_list(c, fd, &dir_f->dents);
mutex_unlock(&dir_f->sem);
} else {
- uint32_t nhash = full_name_hash(name, namelen);
+ uint32_t nhash = full_name_hash(NULL, name, namelen);
fd = dir_f->dents;
/* We don't actually want to reserve any space, but we do
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index b2555ef07a12b..5d6030826c520 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -17,18 +17,19 @@
#include "nodelist.h"
static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+ return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED,
name, buffer, size);
}
static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+ return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED,
name, buffer, size, flags);
}
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 539bd630b5e42..9d027b4abcf99 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -17,18 +17,19 @@
#include "nodelist.h"
static int jffs2_user_getxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+ return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER,
name, buffer, size);
}
static int jffs2_user_setxattr(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
{
- return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+ return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER,
name, buffer, size, flags);
}
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 49456853e9de2..21fa92ba2c191 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -34,10 +34,6 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
int size;
char *value = NULL;
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch(type) {
case ACL_TYPE_ACCESS:
ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -67,8 +63,6 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
acl = posix_acl_from_xattr(&init_user_ns, value, size);
}
kfree(value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
return acl;
}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 4ce7735dd0422..7f1a585a0a947 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -140,10 +140,10 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
}
const struct inode_operations jfs_file_inode_operations = {
- .setxattr = jfs_setxattr,
- .getxattr = jfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = jfs_removexattr,
+ .removexattr = generic_removexattr,
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9d9bae63ae2a2..ad3e7b1effc4b 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -102,8 +102,8 @@ int jfs_commit_inode(struct inode *inode, int wait)
* partitions and may think inode is dirty
*/
if (!special_file(inode->i_mode) && noisy) {
- jfs_err("jfs_commit_inode(0x%p) called on "
- "read-only volume", inode);
+ jfs_err("jfs_commit_inode(0x%p) called on read-only volume",
+ inode);
jfs_err("Is remount racy?");
noisy--;
}
@@ -332,8 +332,7 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, jfs_get_block);
}
-static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -341,7 +340,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, jfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -349,7 +348,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
*/
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
+ loff_t end = iocb->ki_pos + count;
if (end > isize)
jfs_write_failed(mapping, end);
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index dd824d9b0b1a1..a37eb5f8cbc07 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -58,7 +58,6 @@ static ssize_t jfs_loglevel_proc_write(struct file *file,
}
static const struct file_operations jfs_loglevel_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_loglevel_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c
index dfcd503045593..f76ff0a464442 100644
--- a/fs/jfs/jfs_discard.c
+++ b/fs/jfs/jfs_discard.c
@@ -49,14 +49,12 @@ void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks)
r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0);
if (unlikely(r != 0)) {
- jfs_err("JFS: sb_issue_discard" \
- "(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!\n",
+ jfs_err("JFS: sb_issue_discard(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!",
sb, (unsigned long long)blkno,
(unsigned long long)nblocks, r);
}
- jfs_info("JFS: sb_issue_discard" \
- "(%p, %llu, %llu, GFP_NOFS, 0) = %d\n",
+ jfs_info("JFS: sb_issue_discard(%p, %llu, %llu, GFP_NOFS, 0) = %d",
sb, (unsigned long long)blkno,
(unsigned long long)nblocks, r);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index d88576e23fe4c..de2bcb36e0793 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3072,8 +3072,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
}
if (dirtab_slot.flag == DIR_INDEX_FREE) {
if (loop_count++ > JFS_IP(ip)->next_index) {
- jfs_err("jfs_readdir detected "
- "infinite loop!");
+ jfs_err("jfs_readdir detected infinite loop!");
ctx->pos = DIREND;
return 0;
}
@@ -3151,8 +3150,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
return 0;
} else {
- jfs_err("jfs_readdir called with "
- "invalid offset!");
+ jfs_err("jfs_readdir called with invalid offset!");
}
dtoffset->pn = 1;
dtoffset->index = 0;
@@ -3165,8 +3163,8 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
}
if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
- jfs_err("jfs_readdir: unexpected rc = %d "
- "from dtReadNext", rc);
+ jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext",
+ rc);
ctx->pos = DIREND;
return 0;
}
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f321986e73d28..6aca224a5d684 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -534,8 +534,7 @@ void diWriteSpecial(struct inode *ip, int secondary)
/* read the page of fixed disk inode (AIT) in raw mode */
mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
if (mp == NULL) {
- jfs_err("diWriteSpecial: failed to read aggregate inode "
- "extent!");
+ jfs_err("diWriteSpecial: failed to read aggregate inode extent!");
return;
}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index cf7936fe2e682..5e33cb9a190d8 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -151,7 +151,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_inode->xtlid = 0;
jfs_set_inode_flags(inode);
- jfs_info("ialloc returns inode = 0x%p\n", inode);
+ jfs_info("ialloc returns inode = 0x%p", inode);
return inode;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a270cb7ff4e03..a21ea8b3e5fa6 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1094,7 +1094,7 @@ int lmLogOpen(struct super_block *sb)
if (log->bdev->bd_dev == sbi->logdev) {
if (memcmp(log->uuid, sbi->loguuid,
sizeof(log->uuid))) {
- jfs_warn("wrong uuid on JFS journal\n");
+ jfs_warn("wrong uuid on JFS journal");
mutex_unlock(&jfs_log_mutex);
return -EINVAL;
}
@@ -1333,9 +1333,8 @@ int lmLogInit(struct jfs_log * log)
rc = -EINVAL;
goto errout20;
}
- jfs_info("lmLogInit: inline log:0x%p base:0x%Lx "
- "size:0x%x", log,
- (unsigned long long) log->base, log->size);
+ jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x",
+ log, (unsigned long long)log->base, log->size);
} else {
if (memcmp(logsuper->uuid, log->uuid, 16)) {
jfs_warn("wrong uuid on JFS log device");
@@ -1343,9 +1342,8 @@ int lmLogInit(struct jfs_log * log)
}
log->size = le32_to_cpu(logsuper->size);
log->l2bsize = le32_to_cpu(logsuper->l2bsize);
- jfs_info("lmLogInit: external log:0x%p base:0x%Lx "
- "size:0x%x", log,
- (unsigned long long) log->base, log->size);
+ jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x",
+ log, (unsigned long long)log->base, log->size);
}
log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
@@ -2004,12 +2002,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
+ bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
lbmIODone(bio);
} else {
- submit_bio(READ_SYNC, bio);
+ submit_bio(bio);
}
wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
@@ -2136,7 +2135,7 @@ static void lbmStartIO(struct lbuf * bp)
struct bio *bio;
struct jfs_log *log = bp->l_log;
- jfs_info("lbmStartIO\n");
+ jfs_info("lbmStartIO");
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
@@ -2147,13 +2146,14 @@ static void lbmStartIO(struct lbuf * bp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
lbmIODone(bio);
} else {
- submit_bio(WRITE_SYNC, bio);
+ submit_bio(bio);
INCREMENT(lmStat.submitted);
}
}
@@ -2517,7 +2517,6 @@ static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_lmstats_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_lmstats_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index a3eb316b1ac38..489aaa1403e57 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -80,7 +80,7 @@ static inline void lock_metapage(struct metapage *mp)
static struct kmem_cache *metapage_cache;
static mempool_t *metapage_mempool;
-#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE)
+#define MPS_PER_PAGE (PAGE_SIZE >> L2PSIZE)
#if MPS_PER_PAGE > 1
@@ -316,7 +316,7 @@ static void last_write_complete(struct page *page)
struct metapage *mp;
unsigned int offset;
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (mp && test_bit(META_io, &mp->flag)) {
if (mp->lsn)
@@ -366,12 +366,12 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
int bad_blocks = 0;
page_start = (sector_t)page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
set_page_writeback(page);
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (!mp || !test_bit(META_dirty, &mp->flag))
@@ -411,12 +411,12 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
inc_io(page);
if (!bio->bi_iter.bi_size)
goto dump_bio;
- submit_bio(WRITE, bio);
+ submit_bio(bio);
nr_underway++;
bio = NULL;
} else
inc_io(page);
- xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
+ xlen = (PAGE_SIZE - offset) >> inode->i_blkbits;
pblock = metapage_get_blocks(inode, lblock, &xlen);
if (!pblock) {
printk(KERN_ERR "JFS: metapage_get_blocks failed\n");
@@ -434,6 +434,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_write_end_io;
bio->bi_private = page;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
/* Don't call bio_add_page yet, we may add to this vec */
bio_offset = offset;
@@ -448,7 +449,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
if (!bio->bi_iter.bi_size)
goto dump_bio;
- submit_bio(WRITE, bio);
+ submit_bio(bio);
nr_underway++;
}
if (redirty)
@@ -485,7 +486,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
struct inode *inode = page->mapping->host;
struct bio *bio = NULL;
int block_offset;
- int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+ int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
sector_t page_start; /* address of page in fs blocks */
sector_t pblock;
int xlen;
@@ -494,7 +495,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
BUG_ON(!PageLocked(page));
page_start = (sector_t)page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
block_offset = 0;
while (block_offset < blocks_per_page) {
@@ -506,7 +507,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
insert_metapage(page, NULL);
inc_io(page);
if (bio)
- submit_bio(READ, bio);
+ submit_bio(bio);
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_bdev = inode->i_sb->s_bdev;
@@ -514,6 +515,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_read_end_io;
bio->bi_private = page;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
len = xlen << inode->i_blkbits;
offset = block_offset << inode->i_blkbits;
if (bio_add_page(bio, page, len, offset) < len)
@@ -523,7 +525,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
block_offset++;
}
if (bio)
- submit_bio(READ, bio);
+ submit_bio(bio);
else
unlock_page(page);
@@ -542,7 +544,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
int ret = 1;
int offset;
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (!mp)
@@ -568,7 +570,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
static void metapage_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- BUG_ON(offset || length < PAGE_CACHE_SIZE);
+ BUG_ON(offset || length < PAGE_SIZE);
BUG_ON(PageWriteback(page));
@@ -599,10 +601,10 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
inode->i_ino, lblock, absolute);
l2bsize = inode->i_blkbits;
- l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+ l2BlocksPerPage = PAGE_SHIFT - l2bsize;
page_index = lblock >> l2BlocksPerPage;
page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize;
- if ((page_offset + size) > PAGE_CACHE_SIZE) {
+ if ((page_offset + size) > PAGE_SIZE) {
jfs_err("MetaData crosses page boundary!!");
jfs_err("lblock = %lx, size = %d", lblock, size);
dump_stack();
@@ -621,7 +623,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
mapping = inode->i_mapping;
}
- if (new && (PSIZE == PAGE_CACHE_SIZE)) {
+ if (new && (PSIZE == PAGE_SIZE)) {
page = grab_cache_page(mapping, page_index);
if (!page) {
jfs_err("grab_cache_page failed!");
@@ -693,7 +695,7 @@ unlock:
void grab_metapage(struct metapage * mp)
{
jfs_info("grab_metapage: mp = 0x%p", mp);
- page_cache_get(mp->page);
+ get_page(mp->page);
lock_page(mp->page);
mp->count++;
lock_metapage(mp);
@@ -706,12 +708,12 @@ void force_metapage(struct metapage *mp)
jfs_info("force_metapage: mp = 0x%p", mp);
set_bit(META_forcewrite, &mp->flag);
clear_bit(META_sync, &mp->flag);
- page_cache_get(page);
+ get_page(page);
lock_page(page);
set_page_dirty(page);
write_one_page(page, 1);
clear_bit(META_forcewrite, &mp->flag);
- page_cache_release(page);
+ put_page(page);
}
void hold_metapage(struct metapage *mp)
@@ -726,7 +728,7 @@ void put_metapage(struct metapage *mp)
unlock_page(mp->page);
return;
}
- page_cache_get(mp->page);
+ get_page(mp->page);
mp->count++;
lock_metapage(mp);
unlock_page(mp->page);
@@ -746,7 +748,7 @@ void release_metapage(struct metapage * mp)
assert(mp->count);
if (--mp->count || mp->nohomeok) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return;
}
@@ -764,13 +766,13 @@ void release_metapage(struct metapage * mp)
drop_metapage(page, mp);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
void __invalidate_metapages(struct inode *ip, s64 addr, int len)
{
sector_t lblock;
- int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits;
+ int l2BlocksPerPage = PAGE_SHIFT - ip->i_blkbits;
int BlocksPerPage = 1 << l2BlocksPerPage;
/* All callers are interested in block device's mapping */
struct address_space *mapping =
@@ -788,7 +790,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
page = find_lock_page(mapping, lblock >> l2BlocksPerPage);
if (!page)
continue;
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (!mp)
continue;
@@ -803,7 +805,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
remove_from_logsync(mp);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
@@ -828,7 +830,6 @@ static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_mpstat_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_mpstat_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index 337e9e51ac066..a869fb4a20d66 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -106,7 +106,7 @@ static inline void metapage_nohomeok(struct metapage *mp)
lock_page(page);
if (!mp->nohomeok++) {
mark_metapage_dirty(mp);
- page_cache_get(page);
+ get_page(page);
wait_on_page_writeback(page);
}
unlock_page(page);
@@ -128,7 +128,7 @@ static inline void metapage_wait_for_io(struct metapage *mp)
static inline void _metapage_homeok(struct metapage *mp)
{
if (!--mp->nohomeok)
- page_cache_release(mp->page);
+ put_page(mp->page);
}
static inline void metapage_homeok(struct metapage *mp)
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d595856453b24..2e58978d6f45a 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1764,7 +1764,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
if (lwm == next)
goto out;
if (lwm > next) {
- jfs_err("xtLog: lwm > next\n");
+ jfs_err("xtLog: lwm > next");
goto out;
}
tlck->flag |= tlckUPDATEMAP;
@@ -1798,8 +1798,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
xadlock->xdlist = &p->xad[lwm];
tblk->xflag &= ~COMMIT_LAZY;
}
- jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d "
- "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count);
+ jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d",
+ tlck->ip, mp, tlck, lwm, xadlock->count);
maplock->index = 1;
@@ -2025,8 +2025,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
xadlock->count = next - lwm;
xadlock->xdlist = &p->xad[lwm];
- jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d "
- "lwm:%d next:%d",
+ jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d",
tlck->ip, mp, xadlock->count, lwm, next);
maplock->index++;
xadlock++;
@@ -2047,8 +2046,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
pxdlock->count = 1;
pxdlock->pxd = pxd;
- jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d "
- "hwm:%d", ip, mp, pxdlock->count, hwm);
+ jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d",
+ ip, mp, pxdlock->count, hwm);
maplock->index++;
xadlock++;
}
@@ -2066,8 +2065,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
xadlock->count = hwm - next + 1;
xadlock->xdlist = &p->xad[next];
- jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d "
- "next:%d hwm:%d",
+ jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d",
tlck->ip, mp, xadlock->count, next, hwm);
maplock->index++;
}
@@ -2523,8 +2521,7 @@ void txFreeMap(struct inode *ip,
xlen = lengthXAD(xad);
dbUpdatePMap(ipbmap, true, xaddr,
(s64) xlen, tblk);
- jfs_info("freePMap: xaddr:0x%lx "
- "xlen:%d",
+ jfs_info("freePMap: xaddr:0x%lx xlen:%d",
(ulong) xaddr, xlen);
}
}
@@ -2814,7 +2811,7 @@ int jfs_lazycommit(void *arg)
if (!list_empty(&TxAnchor.unlock_queue))
jfs_err("jfs_lazycommit being killed w/pending transactions!");
else
- jfs_info("jfs_lazycommit being killed\n");
+ jfs_info("jfs_lazycommit being killed");
return 0;
}
@@ -3043,7 +3040,6 @@ static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_txanchor_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_txanchor_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -3084,7 +3080,6 @@ static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_txstats_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_txstats_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index e8d717dabca3e..561f6af46288f 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -19,6 +19,8 @@
#ifndef H_JFS_XATTR
#define H_JFS_XATTR
+#include <linux/xattr.h>
+
/*
* jfs_ea_list describe the on-disk format of the extended attributes.
* I know the null-terminator is redundant since namelen is stored, but
@@ -54,12 +56,8 @@ struct jfs_ea_list {
extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
size_t, int);
-extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
- int);
extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
-extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
-extern int jfs_removexattr(struct dentry *, const char *);
extern const struct xattr_handler *jfs_xattr_handlers[];
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5ad7748860ce6..5cde6d2fcfca6 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3894,7 +3894,6 @@ static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_xtstat_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_xtstat_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 701f89370de7a..814b0c58016cc 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1225,8 +1225,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rc = dtSearch(new_dir, &new_dname, &ino, &btstack,
JFS_CREATE);
if (rc) {
- jfs_err("jfs_rename didn't expect dtSearch to fail "
- "w/rc = %d", rc);
+ jfs_err("jfs_rename didn't expect dtSearch to fail w/rc = %d",
+ rc);
goto out_tx;
}
@@ -1524,7 +1524,7 @@ struct dentry *jfs_get_parent(struct dentry *dentry)
parent_ino =
le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot);
- return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino));
+ return d_obtain_alias(jfs_iget(dentry->d_sb, parent_ino));
}
const struct inode_operations jfs_dir_inode_operations = {
@@ -1537,10 +1537,10 @@ const struct inode_operations jfs_dir_inode_operations = {
.rmdir = jfs_rmdir,
.mknod = jfs_mknod,
.rename = jfs_rename,
- .setxattr = jfs_setxattr,
- .getxattr = jfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = jfs_removexattr,
+ .removexattr = generic_removexattr,
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
@@ -1564,7 +1564,7 @@ static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
unsigned long hash;
int i;
- hash = init_name_hash();
+ hash = init_name_hash(dir);
for (i=0; i < this->len; i++)
hash = partial_name_hash(tolower(this->name[i]), hash);
this->hash = end_name_hash(hash);
@@ -1572,7 +1572,7 @@ static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
return 0;
}
-static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
+static int jfs_ci_compare(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
int i, result = 1;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4f5d85ba8e237..cec8814a3b8be 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -84,7 +84,7 @@ static void jfs_handle_error(struct super_block *sb)
panic("JFS (device %s): panic forced after error\n",
sb->s_id);
else if (sbi->flag & JFS_ERR_REMOUNT_RO) {
- jfs_err("ERROR: (device %s): remounting filesystem as read-only\n",
+ jfs_err("ERROR: (device %s): remounting filesystem as read-only",
sb->s_id);
sb->s_flags |= MS_RDONLY;
}
@@ -596,7 +596,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
* Page cache is indexed by long.
* I would use MAX_LFS_FILESIZE, but it's only half as big
*/
- sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1,
+ sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1,
(u64)sb->s_maxbytes);
#endif
sb->s_time_gran = 1;
@@ -641,7 +641,7 @@ static int jfs_freeze(struct super_block *sb)
}
rc = updateSuper(sb, FM_CLEAN);
if (rc) {
- jfs_err("jfs_freeze: updateSuper failed\n");
+ jfs_err("jfs_freeze: updateSuper failed");
/*
* Don't fail here. Everything succeeded except
* marking the superblock clean, so there's really
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index f8db4fde0b0b6..c94c7e4a13233 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -25,19 +25,19 @@ const struct inode_operations jfs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = jfs_setattr,
- .setxattr = jfs_setxattr,
- .getxattr = jfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = jfs_removexattr,
+ .removexattr = generic_removexattr,
};
const struct inode_operations jfs_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = page_get_link,
.setattr = jfs_setattr,
- .setxattr = jfs_setxattr,
- .getxattr = jfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = jfs_removexattr,
+ .removexattr = generic_removexattr,
};
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 48b15a6e55586..0bf3c33aedff9 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -86,6 +86,14 @@ struct ea_buffer {
#define EA_MALLOC 0x0008
+/*
+ * Mapping of on-disk attribute names: for on-disk attribute names with an
+ * unknown prefix (not "system.", "user.", "security.", or "trusted."), the
+ * prefix "os2." is prepended. On the way back to disk, "os2." prefixes are
+ * stripped and we make sure that the remaining name does not start with one
+ * of the known prefixes.
+ */
+
static int is_known_namespace(const char *name)
{
if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
@@ -97,29 +105,19 @@ static int is_known_namespace(const char *name)
return true;
}
-/*
- * These three routines are used to recognize on-disk extended attributes
- * that are in a recognized namespace. If the attribute is not recognized,
- * "os2." is prepended to the name
- */
-static int is_os2_xattr(struct jfs_ea *ea)
-{
- return !is_known_namespace(ea->name);
-}
-
static inline int name_size(struct jfs_ea *ea)
{
- if (is_os2_xattr(ea))
- return ea->namelen + XATTR_OS2_PREFIX_LEN;
- else
+ if (is_known_namespace(ea->name))
return ea->namelen;
+ else
+ return ea->namelen + XATTR_OS2_PREFIX_LEN;
}
static inline int copy_name(char *buffer, struct jfs_ea *ea)
{
int len = ea->namelen;
- if (is_os2_xattr(ea)) {
+ if (!is_known_namespace(ea->name)) {
memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN);
buffer += XATTR_OS2_PREFIX_LEN;
len += XATTR_OS2_PREFIX_LEN;
@@ -665,35 +663,6 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
return 0;
}
-/*
- * Most of the permission checking is done by xattr_permission in the vfs.
- * We also need to verify that this is a namespace that we recognize.
- */
-static int can_set_xattr(struct inode *inode, const char *name,
- const void *value, size_t value_len)
-{
- if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
- /*
- * This makes sure that we aren't trying to set an
- * attribute in a different namespace by prefixing it
- * with "os2."
- */
- if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
- return -EOPNOTSUPP;
- return 0;
- }
-
- /*
- * Don't allow setting an attribute in an unknown namespace.
- */
- if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
- strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
- strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EOPNOTSUPP;
-
- return 0;
-}
-
int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
const void *value, size_t value_len, int flags)
{
@@ -704,21 +673,10 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
int xattr_size;
int new_size;
int namelen = strlen(name);
- char *os2name = NULL;
int found = 0;
int rc;
int length;
- if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
- os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
- GFP_KERNEL);
- if (!os2name)
- return -ENOMEM;
- strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
- name = os2name;
- namelen -= XATTR_OS2_PREFIX_LEN;
- }
-
down_write(&JFS_IP(inode)->xattr_sem);
xattr_size = ea_get(inode, &ea_buf, 0);
@@ -841,44 +799,6 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
out:
up_write(&JFS_IP(inode)->xattr_sem);
- kfree(os2name);
-
- return rc;
-}
-
-int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t value_len, int flags)
-{
- struct inode *inode = d_inode(dentry);
- struct jfs_inode_info *ji = JFS_IP(inode);
- int rc;
- tid_t tid;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, value_len, flags);
-
- if ((rc = can_set_xattr(inode, name, value, value_len)))
- return rc;
-
- if (value == NULL) { /* empty EA, do not remove */
- value = "";
- value_len = 0;
- }
-
- tid = txBegin(inode->i_sb, 0);
- mutex_lock(&ji->commit_mutex);
- rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len,
- flags);
- if (!rc)
- rc = txCommit(tid, 1, &inode, 0);
- txEnd(tid);
- mutex_unlock(&ji->commit_mutex);
-
return rc;
}
@@ -933,37 +853,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
return size;
}
-ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
- size_t buf_size)
-{
- int err;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, data, buf_size);
-
- if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
- /*
- * skip past "os2." prefix
- */
- name += XATTR_OS2_PREFIX_LEN;
- /*
- * Don't allow retrieving properly prefixed attributes
- * by prepending them with "os2."
- */
- if (is_known_namespace(name))
- return -EOPNOTSUPP;
- }
-
- err = __jfs_getxattr(d_inode(dentry), name, data, buf_size);
-
- return err;
-}
-
/*
* No special permissions are needed to list attributes except for trusted.*
*/
@@ -1027,27 +916,16 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
return size;
}
-int jfs_removexattr(struct dentry *dentry, const char *name)
+static int __jfs_xattr_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
{
- struct inode *inode = d_inode(dentry);
struct jfs_inode_info *ji = JFS_IP(inode);
- int rc;
tid_t tid;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
-
- if ((rc = can_set_xattr(inode, name, NULL, 0)))
- return rc;
+ int rc;
tid = txBegin(inode->i_sb, 0);
mutex_lock(&ji->commit_mutex);
- rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE);
+ rc = __jfs_setxattr(tid, inode, name, value, size, flags);
if (!rc)
rc = txCommit(tid, 1, &inode, 0);
txEnd(tid);
@@ -1056,15 +934,75 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
return rc;
}
-/*
- * List of handlers for synthetic system.* attributes. All real ondisk
- * attributes are handled directly.
- */
+static int jfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ name = xattr_full_name(handler, name);
+ return __jfs_getxattr(inode, name, value, size);
+}
+
+static int jfs_xattr_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ name = xattr_full_name(handler, name);
+ return __jfs_xattr_set(inode, name, value, size, flags);
+}
+
+static int jfs_xattr_get_os2(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ if (is_known_namespace(name))
+ return -EOPNOTSUPP;
+ return __jfs_getxattr(inode, name, value, size);
+}
+
+static int jfs_xattr_set_os2(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (is_known_namespace(name))
+ return -EOPNOTSUPP;
+ return __jfs_xattr_set(inode, name, value, size, flags);
+}
+
+static const struct xattr_handler jfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = jfs_xattr_get,
+ .set = jfs_xattr_set,
+};
+
+static const struct xattr_handler jfs_os2_xattr_handler = {
+ .prefix = XATTR_OS2_PREFIX,
+ .get = jfs_xattr_get_os2,
+ .set = jfs_xattr_set_os2,
+};
+
+static const struct xattr_handler jfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = jfs_xattr_get,
+ .set = jfs_xattr_set,
+};
+
+static const struct xattr_handler jfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = jfs_xattr_get,
+ .set = jfs_xattr_set,
+};
+
const struct xattr_handler *jfs_xattr_handlers[] = {
#ifdef CONFIG_JFS_POSIX_ACL
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
#endif
+ &jfs_os2_xattr_handler,
+ &jfs_user_xattr_handler,
+ &jfs_security_xattr_handler,
+ &jfs_trusted_xattr_handler,
NULL,
};
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 03b688d19f696..e57174d436830 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -153,9 +153,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
p = buf + len + nlen;
*p = '\0';
for (kn = kn_to; kn != common; kn = kn->parent) {
- nlen = strlen(kn->name);
- p -= nlen;
- memcpy(p, kn->name, nlen);
+ size_t tmp = strlen(kn->name);
+ p -= tmp;
+ memcpy(p, kn->name, tmp);
*(--p) = '/';
}
@@ -336,11 +336,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(ns);
unsigned int len = strlen(name);
while (len--)
hash = partial_name_hash(*name++, hash);
- hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
+ hash = end_name_hash(hash);
hash &= 0x7fffffffU;
/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
if (hash < 2)
@@ -753,7 +753,8 @@ int kernfs_add_one(struct kernfs_node *kn)
ps_iattr = parent->iattr;
if (ps_iattr) {
struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
- ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&ps_iattrs->ia_ctime);
+ ps_iattrs->ia_mtime = ps_iattrs->ia_ctime;
}
mutex_unlock(&kernfs_mutex);
@@ -1279,8 +1280,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
/* update timestamps on the parent */
if (ps_iattr) {
- ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
- ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime);
+ ps_iattr->ia_iattr.ia_mtime =
+ ps_iattr->ia_iattr.ia_ctime;
}
kernfs_put(pos);
@@ -1643,22 +1645,9 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
return 0;
}
-static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
- int whence)
-{
- struct inode *inode = file_inode(file);
- loff_t ret;
-
- inode_lock(inode);
- ret = generic_file_llseek(file, offset, whence);
- inode_unlock(inode);
-
- return ret;
-}
-
const struct file_operations kernfs_dir_fops = {
.read = generic_read_dir,
- .iterate = kernfs_fop_readdir,
+ .iterate_shared = kernfs_fop_readdir,
.release = kernfs_dir_fop_release,
- .llseek = kernfs_dir_fop_llseek,
+ .llseek = generic_file_llseek,
};
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 7247252ee9b1b..e1574008adc9e 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -190,15 +190,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
char *buf;
buf = of->prealloc_buf;
- if (!buf)
+ if (buf)
+ mutex_lock(&of->prealloc_mutex);
+ else
buf = kmalloc(len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
/*
* @of->mutex nests outside active ref and is used both to ensure that
- * the ops aren't called concurrently for the same open file, and
- * to provide exclusive access to ->prealloc_buf (when that exists).
+ * the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
@@ -214,21 +215,23 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
else
len = -EINVAL;
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+
if (len < 0)
- goto out_unlock;
+ goto out_free;
if (copy_to_user(user_buf, buf, len)) {
len = -EFAULT;
- goto out_unlock;
+ goto out_free;
}
*ppos += len;
- out_unlock:
- kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
out_free:
- if (buf != of->prealloc_buf)
+ if (buf == of->prealloc_buf)
+ mutex_unlock(&of->prealloc_mutex);
+ else
kfree(buf);
return len;
}
@@ -284,15 +287,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
}
buf = of->prealloc_buf;
- if (!buf)
+ if (buf)
+ mutex_lock(&of->prealloc_mutex);
+ else
buf = kmalloc(len + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
+ if (copy_from_user(buf, user_buf, len)) {
+ len = -EFAULT;
+ goto out_free;
+ }
+ buf[len] = '\0'; /* guarantee string termination */
+
/*
* @of->mutex nests outside active ref and is used both to ensure that
- * the ops aren't called concurrently for the same open file, and
- * to provide exclusive access to ->prealloc_buf (when that exists).
+ * the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
@@ -301,26 +311,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
goto out_free;
}
- if (copy_from_user(buf, user_buf, len)) {
- len = -EFAULT;
- goto out_unlock;
- }
- buf[len] = '\0'; /* guarantee string termination */
-
ops = kernfs_ops(of->kn);
if (ops->write)
len = ops->write(of, buf, len, *ppos);
else
len = -EINVAL;
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+
if (len > 0)
*ppos += len;
-out_unlock:
- kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
out_free:
- if (buf != of->prealloc_buf)
+ if (buf == of->prealloc_buf)
+ mutex_unlock(&of->prealloc_mutex);
+ else
kfree(buf);
return len;
}
@@ -687,6 +693,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
error = -ENOMEM;
if (!of->prealloc_buf)
goto err_free;
+ mutex_init(&of->prealloc_mutex);
}
/*
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 16405ae88d2d6..63b925d5ba1e4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -54,7 +54,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
iattrs->ia_mode = kn->mode;
iattrs->ia_uid = GLOBAL_ROOT_UID;
iattrs->ia_gid = GLOBAL_ROOT_GID;
- iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+ ktime_get_real_ts(&iattrs->ia_atime);
+ iattrs->ia_mtime = iattrs->ia_atime;
+ iattrs->ia_ctime = iattrs->ia_atime;
simple_xattrs_init(&kn->iattr->xattrs);
out_unlock:
@@ -157,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
return 0;
}
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_node *kn = inode->i_private;
struct kernfs_iattrs *attrs;
void *secdata;
int error;
@@ -172,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
- error = security_inode_setsecurity(d_inode(dentry), suffix,
+ error = security_inode_setsecurity(inode, suffix,
value, size, flags);
if (error)
return error;
- error = security_inode_getsecctx(d_inode(dentry),
+ error = security_inode_getsecctx(inode,
&secdata, &secdata_len);
if (error)
return error;
@@ -208,10 +212,10 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
}
-ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
- size_t size)
+ssize_t kernfs_iop_getxattr(struct dentry *unused, struct inode *inode,
+ const char *name, void *buf, size_t size)
{
- struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_node *kn = inode->i_private;
struct kernfs_iattrs *attrs;
attrs = kernfs_iattrs(kn);
@@ -236,16 +240,18 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime =
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
{
+ struct super_block *sb = inode->i_sb;
inode->i_uid = iattr->ia_uid;
inode->i_gid = iattr->ia_gid;
- inode->i_atime = iattr->ia_atime;
- inode->i_mtime = iattr->ia_mtime;
- inode->i_ctime = iattr->ia_ctime;
+ inode->i_atime = timespec_trunc(iattr->ia_atime, sb->s_time_gran);
+ inode->i_mtime = timespec_trunc(iattr->ia_mtime, sb->s_time_gran);
+ inode->i_ctime = timespec_trunc(iattr->ia_ctime, sb->s_time_gran);
}
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 6762bfbd82071..37159235ac109 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -81,11 +81,12 @@ int kernfs_iop_permission(struct inode *inode, int mask);
int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
size_t size, int flags);
int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
-ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
- size_t size);
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *buf, size_t size);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
/*
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index b67dbccdaf883..b3d73ad52b22a 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
+#include <linux/seq_file.h>
#include "kernfs-internal.h"
@@ -40,6 +41,19 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
return 0;
}
+static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
+{
+ struct kernfs_node *node = dentry->d_fsdata;
+ struct kernfs_root *root = kernfs_root(node);
+ struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+ if (scops && scops->show_path)
+ return scops->show_path(sf, node, root);
+
+ seq_dentry(sf, dentry, " \t\n\\");
+ return 0;
+}
+
const struct super_operations kernfs_sops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
@@ -47,6 +61,7 @@ const struct super_operations kernfs_sops = {
.remount_fs = kernfs_sop_remount_fs,
.show_options = kernfs_sop_show_options,
+ .show_path = kernfs_sop_show_path,
};
/**
@@ -120,9 +135,8 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
kntmp = find_next_ancestor(kn, knparent);
if (WARN_ON(!kntmp))
return ERR_PTR(-EINVAL);
- mutex_lock(&d_inode(dentry)->i_mutex);
- dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
+ strlen(kntmp->name));
dput(dentry);
if (IS_ERR(dtmp))
return dtmp;
@@ -138,8 +152,10 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
struct dentry *root;
info->sb = sb;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ /* Userspace would break if executables or devices appear on sysfs */
+ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = magic;
sb->s_op = &kernfs_sops;
sb->s_time_gran = 1;
@@ -227,7 +243,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
info->root = root;
info->ns = ns;
- sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+ sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
+ &init_user_ns, info);
if (IS_ERR(sb) || sb->s_fs_info != info)
kfree(info);
if (IS_ERR(sb))
diff --git a/fs/libfs.c b/fs/libfs.c
index 0ca80b2af4201..74dc8b9e7f53a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -25,7 +25,7 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
generic_fillattr(inode, stat);
- stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+ stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
EXPORT_SYMBOL(simple_getattr);
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(simple_getattr);
int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
buf->f_type = dentry->d_sb->s_magic;
- buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
return 0;
}
@@ -71,9 +71,7 @@ EXPORT_SYMBOL(simple_lookup);
int dcache_dir_open(struct inode *inode, struct file *file)
{
- static struct qstr cursor_name = QSTR_INIT(".", 1);
-
- file->private_data = d_alloc(file->f_path.dentry, &cursor_name);
+ file->private_data = d_alloc_cursor(file->f_path.dentry);
return file->private_data ? 0 : -ENOMEM;
}
@@ -86,10 +84,64 @@ int dcache_dir_close(struct inode *inode, struct file *file)
}
EXPORT_SYMBOL(dcache_dir_close);
+/* parent is locked at least shared */
+static struct dentry *next_positive(struct dentry *parent,
+ struct list_head *from,
+ int count)
+{
+ unsigned *seq = &parent->d_inode->i_dir_seq, n;
+ struct dentry *res;
+ struct list_head *p;
+ bool skipped;
+ int i;
+
+retry:
+ i = count;
+ skipped = false;
+ n = smp_load_acquire(seq) & ~1;
+ res = NULL;
+ rcu_read_lock();
+ for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+ struct dentry *d = list_entry(p, struct dentry, d_child);
+ if (!simple_positive(d)) {
+ skipped = true;
+ } else if (!--i) {
+ res = d;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (skipped) {
+ smp_rmb();
+ if (unlikely(*seq != n))
+ goto retry;
+ }
+ return res;
+}
+
+static void move_cursor(struct dentry *cursor, struct list_head *after)
+{
+ struct dentry *parent = cursor->d_parent;
+ unsigned n, *seq = &parent->d_inode->i_dir_seq;
+ spin_lock(&parent->d_lock);
+ for (;;) {
+ n = *seq;
+ if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
+ break;
+ cpu_relax();
+ }
+ __list_del(cursor->d_child.prev, cursor->d_child.next);
+ if (after)
+ list_add(&cursor->d_child, after);
+ else
+ list_add_tail(&cursor->d_child, &parent->d_subdirs);
+ smp_store_release(seq, n + 2);
+ spin_unlock(&parent->d_lock);
+}
+
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry *dentry = file->f_path.dentry;
- inode_lock(d_inode(dentry));
switch (whence) {
case 1:
offset += file->f_pos;
@@ -97,34 +149,21 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- inode_unlock(d_inode(dentry));
return -EINVAL;
}
if (offset != file->f_pos) {
file->f_pos = offset;
if (file->f_pos >= 2) {
- struct list_head *p;
struct dentry *cursor = file->private_data;
+ struct dentry *to;
loff_t n = file->f_pos - 2;
- spin_lock(&dentry->d_lock);
- /* d_lock not required for cursor */
- list_del(&cursor->d_child);
- p = dentry->d_subdirs.next;
- while (n && p != &dentry->d_subdirs) {
- struct dentry *next;
- next = list_entry(p, struct dentry, d_child);
- spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
- if (simple_positive(next))
- n--;
- spin_unlock(&next->d_lock);
- p = p->next;
- }
- list_add_tail(&cursor->d_child, p);
- spin_unlock(&dentry->d_lock);
+ inode_lock_shared(dentry->d_inode);
+ to = next_positive(dentry, &dentry->d_subdirs, n);
+ move_cursor(cursor, to ? &to->d_child : NULL);
+ inode_unlock_shared(dentry->d_inode);
}
}
- inode_unlock(d_inode(dentry));
return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);
@@ -145,36 +184,25 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *p, *q = &cursor->d_child;
+ struct list_head *p = &cursor->d_child;
+ struct dentry *next;
+ bool moved = false;
if (!dir_emit_dots(file, ctx))
return 0;
- spin_lock(&dentry->d_lock);
- if (ctx->pos == 2)
- list_move(q, &dentry->d_subdirs);
- for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
- struct dentry *next = list_entry(p, struct dentry, d_child);
- spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
- if (!simple_positive(next)) {
- spin_unlock(&next->d_lock);
- continue;
- }
-
- spin_unlock(&next->d_lock);
- spin_unlock(&dentry->d_lock);
+ if (ctx->pos == 2)
+ p = &dentry->d_subdirs;
+ while ((next = next_positive(dentry, p, 1)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
d_inode(next)->i_ino, dt_type(d_inode(next))))
- return 0;
- spin_lock(&dentry->d_lock);
- spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
- /* next is still alive */
- list_move(q, p);
- spin_unlock(&next->d_lock);
- p = q;
+ break;
+ moved = true;
+ p = &next->d_child;
ctx->pos++;
}
- spin_unlock(&dentry->d_lock);
+ if (moved)
+ move_cursor(cursor, p);
return 0;
}
EXPORT_SYMBOL(dcache_readdir);
@@ -190,7 +218,7 @@ const struct file_operations simple_dir_operations = {
.release = dcache_dir_close,
.llseek = dcache_dir_lseek,
.read = generic_read_dir,
- .iterate = dcache_readdir,
+ .iterate_shared = dcache_readdir,
.fsync = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);
@@ -395,7 +423,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
pgoff_t index;
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -403,10 +431,10 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
*pagep = page;
- if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ if (!PageUptodate(page) && (len != PAGE_SIZE)) {
+ unsigned from = pos & (PAGE_SIZE - 1);
- zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
+ zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
}
return 0;
}
@@ -442,7 +470,7 @@ int simple_write_end(struct file *file, struct address_space *mapping,
/* zero the stale part of the page if we did a short copy */
if (copied < len) {
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
zero_user(page, from + copied, len - copied);
}
@@ -458,7 +486,7 @@ int simple_write_end(struct file *file, struct address_space *mapping,
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -477,8 +505,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
struct dentry *dentry;
int i;
- s->s_blocksize = PAGE_CACHE_SIZE;
- s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ s->s_blocksize = PAGE_SIZE;
+ s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = magic;
s->s_op = &simple_super_operations;
s->s_time_gran = 1;
@@ -994,12 +1022,12 @@ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
u64 last_fs_block = num_blocks - 1;
u64 last_fs_page =
- last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+ last_fs_block >> (PAGE_SHIFT - blocksize_bits);
if (unlikely(num_blocks == 0))
return 0;
- if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+ if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
return -EINVAL;
if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
@@ -1121,14 +1149,15 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
return -EPERM;
}
-static int empty_dir_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
return -EOPNOTSUPP;
}
-static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
- void *value, size_t size)
+static ssize_t empty_dir_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
{
return -EOPNOTSUPP;
}
@@ -1169,7 +1198,7 @@ static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
static const struct file_operations empty_dir_operations = {
.llseek = empty_dir_llseek,
.read = generic_read_dir,
- .iterate = empty_dir_readdir,
+ .iterate_shared = empty_dir_readdir,
.fsync = noop_fsync,
};
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
index 2a0a98480e39d..8f72cb237ef34 100644
--- a/fs/lockd/procfs.c
+++ b/fs/lockd/procfs.c
@@ -64,7 +64,6 @@ static const struct file_operations lockd_end_grace_operations = {
.read = nlm_end_grace_read,
.llseek = default_llseek,
.release = simple_transaction_release,
- .owner = THIS_MODULE,
};
int __init
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 154a107cd3762..fc4084ef4736d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -335,12 +335,17 @@ static struct notifier_block lockd_inet6addr_notifier = {
};
#endif
-static void lockd_svc_exit_thread(void)
+static void lockd_unregister_notifiers(void)
{
unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
#if IS_ENABLED(CONFIG_IPV6)
unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
+}
+
+static void lockd_svc_exit_thread(void)
+{
+ lockd_unregister_notifiers();
svc_exit_thread(nlmsvc_rqst);
}
@@ -462,7 +467,7 @@ int lockd_up(struct net *net)
* Note: svc_serv structures have an initial use count of 1,
* so we exit through here on both success and failure.
*/
-err_net:
+err_put:
svc_destroy(serv);
err_create:
mutex_unlock(&nlmsvc_mutex);
@@ -470,7 +475,9 @@ err_create:
err_start:
lockd_down_net(serv, net);
- goto err_net;
+err_net:
+ lockd_unregister_notifiers();
+ goto err_put;
}
EXPORT_SYMBOL_GPL(lockd_up);
diff --git a/fs/locks.c b/fs/locks.c
index 7c5f91be9b65c..ee1b15f6fc135 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1628,7 +1628,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
{
struct file_lock *fl, *my_fl = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct file_lock_context *ctx;
bool is_deleg = (*flp)->fl_flags & FL_DELEG;
int error;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a709d80c8ebcc..a8329cc47decd 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -14,7 +14,7 @@
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
-static int sync_request(struct page *page, struct block_device *bdev, int rw)
+static int sync_request(struct page *page, struct block_device *bdev, int op)
{
struct bio bio;
struct bio_vec bio_vec;
@@ -29,8 +29,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
bio.bi_bdev = bdev;
bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
bio.bi_iter.bi_size = PAGE_SIZE;
+ bio_set_op_attrs(&bio, op, 0);
- return submit_bio_wait(rw, &bio);
+ return submit_bio_wait(&bio);
}
static int bdev_readpage(void *_sb, struct page *page)
@@ -64,7 +65,7 @@ static void writeseg_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
end_page_writeback(bvec->bv_page);
- page_cache_release(bvec->bv_page);
+ put_page(bvec->bv_page);
}
bio_put(bio);
if (atomic_dec_and_test(&super->s_pending_writes))
@@ -95,8 +96,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = writeseg_end_io;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
atomic_inc(&super->s_pending_writes);
- submit_bio(WRITE, bio);
+ submit_bio(bio);
ofs += i * PAGE_SIZE;
index += i;
@@ -122,8 +124,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = writeseg_end_io;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
atomic_inc(&super->s_pending_writes);
- submit_bio(WRITE, bio);
+ submit_bio(bio);
return 0;
}
@@ -185,8 +188,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = erase_end_io;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
atomic_inc(&super->s_pending_writes);
- submit_bio(WRITE, bio);
+ submit_bio(bio);
ofs += i * PAGE_SIZE;
index += i;
@@ -206,8 +210,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = erase_end_io;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
atomic_inc(&super->s_pending_writes);
- submit_bio(WRITE, bio);
+ submit_bio(bio);
return 0;
}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 9c501449450dc..b76a62b1978fd 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -46,9 +46,9 @@ static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
- BUG_ON(len > PAGE_CACHE_SIZE);
- page_start = ofs & PAGE_CACHE_MASK;
- page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
+ BUG_ON(len > PAGE_SIZE);
+ page_start = ofs & PAGE_MASK;
+ page_end = PAGE_ALIGN(ofs + len) - 1;
ret = mtd_write(mtd, ofs, len, &retlen, buf);
if (ret || (retlen != len))
return -EIO;
@@ -82,7 +82,7 @@ static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
if (!page)
continue;
memset(page_address(page), 0xFF, PAGE_SIZE);
- page_cache_release(page);
+ put_page(page);
}
return 0;
}
@@ -195,7 +195,7 @@ static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
page_address(page));
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (err)
return err;
}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 542468e9bfb49..9568064ecadf2 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -95,7 +95,7 @@ static int beyond_eof(struct inode *inode, loff_t bix)
* of each character and pick a prime nearby, preferably a bit-sparse
* one.
*/
-static u32 hash_32(const char *s, int len, u32 seed)
+static u32 logfs_hash_32(const char *s, int len, u32 seed)
{
u32 hash = seed;
int i;
@@ -156,10 +156,10 @@ static pgoff_t hash_index(u32 hash, int round)
static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
{
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
struct page *page;
struct logfs_disk_dentry *dd;
- u32 hash = hash_32(name->name, name->len, 0);
+ u32 hash = logfs_hash_32(name->name, name->len, 0);
pgoff_t index;
int round;
@@ -183,7 +183,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
if (name->len != be16_to_cpu(dd->namelen) ||
memcmp(name->name, dd->name, name->len)) {
kunmap_atomic(dd);
- page_cache_release(page);
+ put_page(page);
continue;
}
@@ -238,7 +238,7 @@ static int logfs_unlink(struct inode *dir, struct dentry *dentry)
return PTR_ERR(page);
}
index = page->index;
- page_cache_release(page);
+ put_page(page);
mutex_lock(&super->s_dirop_mutex);
logfs_add_transaction(dir, ta);
@@ -316,14 +316,14 @@ static int logfs_readdir(struct file *file, struct dir_context *ctx)
be16_to_cpu(dd->namelen),
be64_to_cpu(dd->ino), dd->type);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
if (full)
break;
}
return 0;
}
-static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
+static void logfs_set_name(struct logfs_disk_dentry *dd, const struct qstr *name)
{
dd->namelen = cpu_to_be16(name->len);
memcpy(dd->name, name->name, name->len);
@@ -349,7 +349,7 @@ static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
dd = kmap_atomic(page);
ino = be64_to_cpu(dd->ino);
kunmap_atomic(dd);
- page_cache_release(page);
+ put_page(page);
inode = logfs_iget(dir->i_sb, ino);
if (IS_ERR(inode))
@@ -370,7 +370,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
{
struct page *page;
struct logfs_disk_dentry *dd;
- u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
+ u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0);
pgoff_t index;
int round, err;
@@ -392,7 +392,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
err = logfs_write_buf(dir, page, WF_LOCK);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (!err)
grow_dir(dir, index);
return err;
@@ -561,7 +561,7 @@ static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
map = kmap_atomic(page);
memcpy(dd, map, sizeof(*dd));
kunmap_atomic(map);
- page_cache_release(page);
+ put_page(page);
return 0;
}
@@ -791,7 +791,7 @@ const struct inode_operations logfs_dir_iops = {
const struct file_operations logfs_dir_fops = {
.fsync = logfs_fsync,
.unlocked_ioctl = logfs_ioctl,
- .iterate = logfs_readdir,
+ .iterate_shared = logfs_readdir,
.read = generic_read_dir,
- .llseek = default_llseek,
+ .llseek = generic_file_llseek,
};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 61eaeb1b6cac1..f01ddfb1a03b6 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -15,21 +15,21 @@ static int logfs_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
- if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+ if ((len == PAGE_SIZE) || PageUptodate(page))
return 0;
- if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ if ((pos & PAGE_MASK) >= i_size_read(inode)) {
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + len;
/* Reading beyond i_size is simple: memset to zero */
- zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+ zero_user_segments(page, 0, start, end, PAGE_SIZE);
return 0;
}
return logfs_readpage_nolock(page);
@@ -41,11 +41,11 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
pgoff_t index = page->index;
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + copied;
int ret = 0;
- BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+ BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize);
BUG_ON(page->index > I3_BLOCKS);
if (copied < len) {
@@ -61,8 +61,8 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
if (copied == 0)
goto out; /* FIXME: do we need to update inode? */
- if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
- i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+ if (i_size_read(inode) < (index << PAGE_SHIFT) + end) {
+ i_size_write(inode, (index << PAGE_SHIFT) + end);
mark_inode_dirty_sync(inode);
}
@@ -75,7 +75,7 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
}
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ret ? ret : copied;
}
@@ -118,7 +118,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
u64 bix;
level_t level;
@@ -142,7 +142,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
return __logfs_writepage(page);
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (bix > end_index || offset == 0) {
unlock_page(page);
return 0; /* don't care */
@@ -155,7 +155,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
return __logfs_writepage(page);
}
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 20973c9e52f80..3fb8c6d67303e 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -281,7 +281,7 @@ static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
static void logfs_put_read_page(struct page *page)
{
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
static void logfs_lock_write_page(struct page *page)
@@ -323,7 +323,7 @@ repeat:
return NULL;
err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
if (unlikely(err)) {
- page_cache_release(page);
+ put_page(page);
if (err == -EEXIST)
goto repeat;
return NULL;
@@ -342,7 +342,7 @@ static void logfs_unlock_write_page(struct page *page)
static void logfs_put_write_page(struct page *page)
{
logfs_unlock_write_page(page);
- page_cache_release(page);
+ put_page(page);
}
static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
@@ -562,7 +562,7 @@ static void indirect_free_block(struct super_block *sb,
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
set_page_private(page, 0);
}
__free_block(sb, block);
@@ -655,7 +655,7 @@ static void alloc_data_block(struct inode *inode, struct page *page)
block->page = page;
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long) block);
block->ops = &indirect_block_ops;
@@ -709,7 +709,7 @@ static u64 block_get_pointer(struct page *page, int index)
static int logfs_read_empty(struct page *page)
{
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
return 0;
}
@@ -1660,7 +1660,7 @@ static int truncate_data_block(struct inode *inode, struct page *page,
if (err)
return err;
- zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
+ zero_user_segment(page, size - pageofs, PAGE_SIZE);
return logfs_segment_write(inode, page, shadow);
}
@@ -1919,7 +1919,7 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
block->page = NULL;
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
set_page_private(page, 0);
}
}
@@ -1940,7 +1940,7 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long) block);
}
@@ -1971,7 +1971,7 @@ int logfs_read_inode(struct inode *inode)
logfs_disk_to_inode(di, inode);
kunmap_atomic(di);
move_page_to_inode(inode, page);
- page_cache_release(page);
+ put_page(page);
return 0;
}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index d270e4b2ab6b0..1efd6055f4b05 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -90,9 +90,9 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
}
- page_cache_release(page);
+ put_page(page);
buf += copylen;
len -= copylen;
@@ -117,9 +117,9 @@ static void pad_partial_page(struct logfs_area *area)
memset(page_address(page) + offset, 0xff, len);
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
}
@@ -129,20 +129,20 @@ static void pad_full_pages(struct logfs_area *area)
struct logfs_super *super = logfs_super(sb);
u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
u32 len = super->s_segsize - area->a_used_bytes;
- pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
- pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+ pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT;
+ pgoff_t no_indizes = len >> PAGE_SHIFT;
struct page *page;
while (no_indizes) {
page = get_mapping_page(sb, index, 0);
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
+ memset(page_address(page), 0xff, PAGE_SIZE);
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
}
- page_cache_release(page);
+ put_page(page);
index++;
no_indizes--;
}
@@ -411,7 +411,7 @@ int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
if (IS_ERR(page))
return PTR_ERR(page);
memcpy(buf, page_address(page) + offset, copylen);
- page_cache_release(page);
+ put_page(page);
buf += copylen;
len -= copylen;
@@ -499,7 +499,7 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long) block);
}
block->ops = &indirect_block_ops;
@@ -554,7 +554,7 @@ void move_page_to_btree(struct page *page)
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
set_page_private(page, 0);
}
block->ops = &btree_block_ops;
@@ -723,9 +723,9 @@ void freeseg(struct super_block *sb, u32 segno)
continue;
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 54360293bcb5c..5751082dba52b 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -48,7 +48,7 @@ void emergency_read_end(struct page *page)
if (page == emergency_page)
mutex_unlock(&emergency_mutex);
else
- page_cache_release(page);
+ put_page(page);
}
static void dump_segfile(struct super_block *sb)
@@ -206,7 +206,7 @@ static int write_one_sb(struct super_block *sb,
logfs_set_segment_erased(sb, segno, ec, 0);
logfs_write_ds(sb, ds, segno, ec);
err = super->s_devops->write_sb(sb, page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -366,24 +366,24 @@ static struct page *find_super_block(struct super_block *sb)
return NULL;
last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
if (!last || IS_ERR(last)) {
- page_cache_release(first);
+ put_page(first);
return NULL;
}
if (!logfs_check_ds(page_address(first))) {
- page_cache_release(last);
+ put_page(last);
return first;
}
/* First one didn't work, try the second superblock */
if (!logfs_check_ds(page_address(last))) {
- page_cache_release(first);
+ put_page(first);
return last;
}
/* Neither worked, sorry folks */
- page_cache_release(first);
- page_cache_release(last);
+ put_page(first);
+ put_page(last);
return NULL;
}
@@ -425,7 +425,7 @@ static int __logfs_read_sb(struct super_block *sb)
super->s_data_levels = ds->ds_data_levels;
super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
+ super->s_data_levels;
- page_cache_release(page);
+ put_page(page);
return 0;
}
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d19ac258105aa..31dcd515b9d50 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -21,14 +21,14 @@ static int minix_readdir(struct file *, struct dir_context *);
const struct file_operations minix_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = minix_readdir,
+ .iterate_shared = minix_readdir,
.fsync = generic_file_fsync,
};
static inline void dir_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -38,10 +38,10 @@ static inline void dir_put_page(struct page *page)
static unsigned
minix_last_byte(struct inode *inode, unsigned long page_nr)
{
- unsigned last_byte = PAGE_CACHE_SIZE;
+ unsigned last_byte = PAGE_SIZE;
- if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT))
- last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ if (page_nr == (inode->i_size >> PAGE_SHIFT))
+ last_byte = inode->i_size & (PAGE_SIZE - 1);
return last_byte;
}
@@ -92,8 +92,8 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
if (pos >= inode->i_size)
return 0;
- offset = pos & ~PAGE_CACHE_MASK;
- n = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_MASK;
+ n = pos >> PAGE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *p, *kaddr, *limit;
@@ -229,7 +229,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
lock_page(page);
kaddr = (char*)page_address(page);
dir_end = kaddr + minix_last_byte(dir, n);
- limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize;
+ limit = kaddr + PAGE_SIZE - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
de = (minix_dirent *)p;
de3 = (minix3_dirent *)p;
@@ -327,7 +327,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
}
kaddr = kmap_atomic(page);
- memset(kaddr, 0, PAGE_CACHE_SIZE);
+ memset(kaddr, 0, PAGE_SIZE);
if (sbi->s_version == MINIX_V3) {
minix3_dirent *de3 = (minix3_dirent *)kaddr;
@@ -350,7 +350,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index a795a11e50c72..2887d1d95ce24 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -243,11 +243,11 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index 6bd9fd90964e2..7a09c55b4bd0d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -50,17 +50,18 @@ static void mpage_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- page_endio(page, bio_data_dir(bio), bio->bi_error);
+ page_endio(page, bio_op(bio), bio->bi_error);
}
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
- guard_bio_eod(rw, bio);
- submit_bio(rw, bio);
+ bio_set_op_attrs(bio, op, op_flags);
+ guard_bio_eod(op, bio);
+ submit_bio(bio);
return NULL;
}
@@ -71,6 +72,8 @@ mpage_alloc(struct block_device *bdev,
{
struct bio *bio;
+ /* Restrict the given (page cache) mask for slab allocations */
+ gfp_flags &= GFP_KERNEL;
bio = bio_alloc(gfp_flags, nr_vecs);
if (bio == NULL && (current->flags & PF_MEMALLOC)) {
@@ -107,7 +110,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
* don't make any buffers if there is only one buffer on
* the page and the page just needs to be set up to date
*/
- if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
+ if (inode->i_blkbits == PAGE_SHIFT &&
buffer_uptodate(bh)) {
SetPageUptodate(page);
return;
@@ -145,7 +148,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
{
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
@@ -162,7 +165,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
@@ -249,7 +252,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
}
if (first_hole != blocks_per_page) {
- zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
+ zero_user_segment(page, first_hole << blkbits, PAGE_SIZE);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -269,7 +272,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && (*last_block_in_bio != blocks[0] - 1))
- bio = mpage_bio_submit(READ, bio);
+ bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
alloc_new:
if (bio == NULL) {
@@ -286,7 +289,7 @@ alloc_new:
length = first_hole << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(READ, bio);
+ bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
goto alloc_new;
}
@@ -294,7 +297,7 @@ alloc_new:
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- bio = mpage_bio_submit(READ, bio);
+ bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
else
*last_block_in_bio = blocks[blocks_per_page - 1];
out:
@@ -302,7 +305,7 @@ out:
confused:
if (bio)
- bio = mpage_bio_submit(READ, bio);
+ bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
if (!PageUptodate(page))
block_read_full_page(page, get_block);
else
@@ -331,7 +334,7 @@ confused:
*
* then this code just gives up and calls the buffer_head-based read function.
* It does handle a page which has holes at the end - that is a common case:
- * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ * the end-of-file on blocksize < PAGE_SIZE setups.
*
* BH_Boundary explanation:
*
@@ -362,7 +365,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+ gfp_t gfp = readahead_gfp_mask(mapping);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -380,11 +383,11 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
&first_logical_block,
get_block, gfp);
}
- page_cache_release(page);
+ put_page(page);
}
BUG_ON(!list_empty(pages));
if (bio)
- mpage_bio_submit(READ, bio);
+ mpage_bio_submit(REQ_OP_READ, 0, bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpages);
@@ -405,7 +408,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
&map_bh, &first_logical_block, get_block, gfp);
if (bio)
- mpage_bio_submit(READ, bio);
+ mpage_bio_submit(REQ_OP_READ, 0, bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
@@ -472,7 +475,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
unsigned long end_index;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
@@ -486,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
- int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -542,7 +545,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
* The page has no buffers: map it to disk
*/
BUG_ON(!PageUptodate(page));
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = (i_size - 1) >> blkbits;
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) {
@@ -574,7 +577,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
first_unmapped = page_block;
page_is_mapped:
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (page->index >= end_index) {
/*
* The page straddles i_size. It must be zeroed out on each
@@ -584,18 +587,18 @@ page_is_mapped:
* is zeroed when mapped, and writes to that region are not
* written out to the file."
*/
- unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = i_size & (PAGE_SIZE - 1);
if (page->index > end_index || !offset)
goto confused;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
}
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(wr, bio);
+ bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
alloc_new:
if (bio == NULL) {
@@ -622,7 +625,7 @@ alloc_new:
wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(wr, bio);
+ bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
goto alloc_new;
}
@@ -632,7 +635,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(wr, bio);
+ bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -644,7 +647,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(wr, bio);
+ bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -701,9 +704,9 @@ mpage_writepages(struct address_space *mapping,
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio) {
- int wr = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : WRITE);
- mpage_bio_submit(wr, mpd.bio);
+ int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : 0);
+ mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
}
blk_finish_plug(&plug);
@@ -722,9 +725,9 @@ int mpage_writepage(struct page *page, get_block_t get_block,
};
int ret = __mpage_writepage(page, wbc, &mpd);
if (mpd.bio) {
- int wr = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : WRITE);
- mpage_bio_submit(wr, mpd.bio);
+ int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : 0);
+ mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
return ret;
}
diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce7660..adb04146df092 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -35,6 +35,8 @@
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
+#include <linux/bitops.h>
+#include <linux/init_task.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -265,7 +267,7 @@ static int check_acl(struct inode *inode, int mask)
if (!acl)
return -EAGAIN;
/* no ->get_acl() calls in RCU mode... */
- if (acl == ACL_NOT_CACHED)
+ if (is_uncached_acl(acl))
return -ECHILD;
return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
}
@@ -408,6 +410,14 @@ int __inode_permission(struct inode *inode, int mask)
* Nobody gets write access to an immutable file.
*/
if (IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ /*
+ * Updating mtime will likely cause i_uid and i_gid to be
+ * written back improperly if their true value is unknown
+ * to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
return -EACCES;
}
@@ -900,6 +910,7 @@ static inline int may_follow_link(struct nameidata *nd)
{
const struct inode *inode;
const struct inode *parent;
+ kuid_t puid;
if (!sysctl_protected_symlinks)
return 0;
@@ -915,7 +926,8 @@ static inline int may_follow_link(struct nameidata *nd)
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_eq(parent->i_uid, inode->i_uid))
+ puid = parent->i_uid;
+ if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
return 0;
if (nd->flags & LOOKUP_RCU)
@@ -1088,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
bool *need_mntput)
{
struct vfsmount *mnt;
+ const struct cred *old_cred;
int err;
if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1109,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd,
path->dentry->d_inode)
return -EISDIR;
+ if (path->dentry->d_sb->s_user_ns != &init_user_ns)
+ return -EACCES;
+
nd->total_link_count++;
if (nd->total_link_count >= 40)
return -ELOOP;
+ old_cred = override_creds(&init_cred);
mnt = path->dentry->d_op->d_automount(path);
+ revert_creds(old_cred);
if (IS_ERR(mnt)) {
/*
* The filesystem is allowed to return -EISDIR here to indicate
@@ -1415,21 +1433,28 @@ static void follow_mount(struct path *path)
}
}
+static int path_parent_directory(struct path *path)
+{
+ struct dentry *old = path->dentry;
+ /* rare case of legitimate dget_parent()... */
+ path->dentry = dget_parent(path->dentry);
+ dput(old);
+ if (unlikely(!path_connected(path)))
+ return -ENOENT;
+ return 0;
+}
+
static int follow_dotdot(struct nameidata *nd)
{
while(1) {
- struct dentry *old = nd->path.dentry;
-
if (nd->path.dentry == nd->root.dentry &&
nd->path.mnt == nd->root.mnt) {
break;
}
if (nd->path.dentry != nd->path.mnt->mnt_root) {
- /* rare case of legitimate dget_parent()... */
- nd->path.dentry = dget_parent(nd->path.dentry);
- dput(old);
- if (unlikely(!path_connected(&nd->path)))
- return -ENOENT;
+ int ret = path_parent_directory(&nd->path);
+ if (ret)
+ return ret;
break;
}
if (!follow_up(&nd->path))
@@ -1441,9 +1466,8 @@ static int follow_dotdot(struct nameidata *nd)
}
/*
- * This looks up the name in dcache, possibly revalidates the old dentry and
- * allocates a new one if not found or not valid. In the need_lookup argument
- * returns whether i_op->lookup is necessary.
+ * This looks up the name in dcache and possibly revalidates the found dentry.
+ * NULL is returned if the dentry does not exist in the cache.
*/
static struct dentry *lookup_dcache(const struct qstr *name,
struct dentry *dir,
@@ -1603,32 +1627,42 @@ static struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
- struct dentry *dentry;
- inode_lock(dir->d_inode);
- dentry = d_lookup(dir, name);
- if (unlikely(dentry)) {
+ struct dentry *dentry = ERR_PTR(-ENOENT), *old;
+ struct inode *inode = dir->d_inode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+ inode_lock_shared(inode);
+ /* Don't go there if it's already dead */
+ if (unlikely(IS_DEADDIR(inode)))
+ goto out;
+again:
+ dentry = d_alloc_parallel(dir, name, &wq);
+ if (IS_ERR(dentry))
+ goto out;
+ if (unlikely(!d_in_lookup(dentry))) {
if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
!(flags & LOOKUP_NO_REVAL)) {
int error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
- if (!error)
+ if (!error) {
d_invalidate(dentry);
+ dput(dentry);
+ goto again;
+ }
dput(dentry);
dentry = ERR_PTR(error);
}
}
- if (dentry) {
- inode_unlock(dir->d_inode);
- return dentry;
+ } else {
+ old = inode->i_op->lookup(inode, dentry, flags);
+ d_lookup_done(dentry);
+ if (unlikely(old)) {
+ dput(dentry);
+ dentry = old;
}
}
- dentry = d_alloc(dir, name);
- if (unlikely(!dentry)) {
- inode_unlock(dir->d_inode);
- return ERR_PTR(-ENOMEM);
- }
- dentry = lookup_real(dir->d_inode, dentry, flags);
- inode_unlock(dir->d_inode);
+out:
+ inode_unlock_shared(inode);
return dentry;
}
@@ -1740,15 +1774,17 @@ static int walk_component(struct nameidata *nd, int flags)
nd->flags);
if (IS_ERR(path.dentry))
return PTR_ERR(path.dentry);
- if (unlikely(d_is_negative(path.dentry))) {
- dput(path.dentry);
- return -ENOENT;
- }
+
path.mnt = nd->path.mnt;
err = follow_managed(&path, nd);
if (unlikely(err < 0))
return err;
+ if (unlikely(d_is_negative(path.dentry))) {
+ path_to_nameidata(&path, nd);
+ return -ENOENT;
+ }
+
seq = 0; /* we are already out of RCU mode */
inode = d_backing_inode(path.dentry);
}
@@ -1785,88 +1821,200 @@ static int walk_component(struct nameidata *nd, int flags)
#include <asm/word-at-a-time.h>
-#ifdef CONFIG_64BIT
+#ifdef HASH_MIX
+
+/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
-static inline unsigned int fold_hash(unsigned long hash)
+#elif defined(CONFIG_64BIT)
+/*
+ * Register pressure in the mixing function is an issue, particularly
+ * on 32-bit x86, but almost any function requires one state value and
+ * one temporary. Instead, use a function designed for two state values
+ * and no temporaries.
+ *
+ * This function cannot create a collision in only two iterations, so
+ * we have two iterations to achieve avalanche. In those two iterations,
+ * we have six layers of mixing, which is enough to spread one bit's
+ * influence out to 2^6 = 64 state bits.
+ *
+ * Rotate constants are scored by considering either 64 one-bit input
+ * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
+ * probability of that delta causing a change to each of the 128 output
+ * bits, using a sample of random initial states.
+ *
+ * The Shannon entropy of the computed probabilities is then summed
+ * to produce a score. Ideally, any input change has a 50% chance of
+ * toggling any given output bit.
+ *
+ * Mixing scores (in bits) for (12,45):
+ * Input delta: 1-bit 2-bit
+ * 1 round: 713.3 42542.6
+ * 2 rounds: 2753.7 140389.8
+ * 3 rounds: 5954.1 233458.2
+ * 4 rounds: 7862.6 256672.2
+ * Perfect: 8192 258048
+ * (64*128) (64*63/2 * 128)
+ */
+#define HASH_MIX(x, y, a) \
+ ( x ^= (a), \
+ y ^= x, x = rol64(x,12),\
+ x += y, y = rol64(y,45),\
+ y *= 9 )
+
+/*
+ * Fold two longs into one 32-bit hash value. This must be fast, but
+ * latency isn't quite as critical, as there is a fair bit of additional
+ * work done before the hash value is used.
+ */
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
- return hash_64(hash, 32);
+ y ^= x * GOLDEN_RATIO_64;
+ y *= GOLDEN_RATIO_64;
+ return y >> 32;
}
#else /* 32-bit case */
-#define fold_hash(x) (x)
+/*
+ * Mixing scores (in bits) for (7,20):
+ * Input delta: 1-bit 2-bit
+ * 1 round: 330.3 9201.6
+ * 2 rounds: 1246.4 25475.4
+ * 3 rounds: 1907.1 31295.1
+ * 4 rounds: 2042.3 31718.6
+ * Perfect: 2048 31744
+ * (32*64) (32*31/2 * 64)
+ */
+#define HASH_MIX(x, y, a) \
+ ( x ^= (a), \
+ y ^= x, x = rol32(x, 7),\
+ x += y, y = rol32(y,20),\
+ y *= 9 )
+
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
+{
+ /* Use arch-optimized multiply if one exists */
+ return __hash_32(y ^ __hash_32(x));
+}
#endif
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/*
+ * Return the hash of a string of known length. This is carefully
+ * designed to match hash_name(), which is the more critical function.
+ * In particular, we must end by hashing a final word containing 0..7
+ * payload bytes, to match the way that hash_name() iterates until it
+ * finds the delimiter after the name.
+ */
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
- unsigned long a, mask;
- unsigned long hash = 0;
+ unsigned long a, x = 0, y = (unsigned long)salt;
for (;;) {
+ if (!len)
+ goto done;
a = load_unaligned_zeropad(name);
if (len < sizeof(unsigned long))
break;
- hash += a;
- hash *= 9;
+ HASH_MIX(x, y, a);
name += sizeof(unsigned long);
len -= sizeof(unsigned long);
- if (!len)
- goto done;
}
- mask = bytemask_from_count(len);
- hash += mask & a;
+ x ^= a & bytemask_from_count(len);
done:
- return fold_hash(hash);
+ return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const void *salt, const char *name)
+{
+ unsigned long a = 0, x = 0, y = (unsigned long)salt;
+ unsigned long adata, mask, len;
+ const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+
+ len = 0;
+ goto inside;
+
+ do {
+ HASH_MIX(x, y, a);
+ len += sizeof(unsigned long);
+inside:
+ a = load_unaligned_zeropad(name+len);
+ } while (!has_zero(a, &adata, &constants));
+
+ adata = prep_zero_mask(a, adata, &constants);
+ mask = create_zero_mask(adata);
+ x ^= a & zero_bytemask(mask);
+
+ return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+}
+EXPORT_SYMBOL(hashlen_string);
+
/*
* Calculate the length and hash of the path component, and
* return the "hash_len" as the result.
*/
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
{
- unsigned long a, b, adata, bdata, mask, hash, len;
+ unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+ unsigned long adata, bdata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- hash = a = 0;
- len = -sizeof(unsigned long);
+ len = 0;
+ goto inside;
+
do {
- hash = (hash + a) * 9;
+ HASH_MIX(x, y, a);
len += sizeof(unsigned long);
+inside:
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
adata = prep_zero_mask(a, adata, &constants);
bdata = prep_zero_mask(b, bdata, &constants);
-
mask = create_zero_mask(adata | bdata);
+ x ^= a & zero_bytemask(mask);
- hash += a & zero_bytemask(mask);
- len += find_zero(mask);
- return hashlen_create(fold_hash(hash), len);
+ return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
-#else
+#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/* Return the hash of a string of known length */
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
while (len--)
- hash = partial_name_hash(*name++, hash);
+ hash = partial_name_hash((unsigned char)*name++, hash);
return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const void *salt, const char *name)
+{
+ unsigned long hash = init_name_hash(salt);
+ unsigned long len = 0, c;
+
+ c = (unsigned char)*name;
+ while (c) {
+ len++;
+ hash = partial_name_hash(c, hash);
+ c = (unsigned char)name[len];
+ }
+ return hashlen_create(end_name_hash(hash), len);
+}
+EXPORT_SYMBOL(hashlen_string);
+
/*
* We know there's a real path component here of at least
* one character.
*/
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
unsigned long len = 0, c;
c = (unsigned char)*name;
@@ -1903,10 +2051,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
int type;
err = may_lookup(nd);
- if (err)
+ if (err)
return err;
- hash_len = hash_name(name);
+ hash_len = hash_name(nd->path.dentry, name);
type = LAST_NORM;
if (name[0] == '.') switch (hashlen_len(hash_len)) {
@@ -2285,7 +2433,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
this.name = name;
this.len = len;
- this.hash = full_name_hash(name, len);
+ this.hash = full_name_hash(base, name, len);
if (!len)
return ERR_PTR(-EACCES);
@@ -2339,7 +2487,7 @@ struct dentry *lookup_one_len_unlocked(const char *name,
this.name = name;
this.len = len;
- this.hash = full_name_hash(name, len);
+ this.hash = full_name_hash(base, name, len);
if (!len)
return ERR_PTR(-EACCES);
@@ -2374,6 +2522,34 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+ /* Find something mounted on "pts" in the same directory as
+ * the input path.
+ */
+ struct dentry *child, *parent;
+ struct qstr this;
+ int ret;
+
+ ret = path_parent_directory(path);
+ if (ret)
+ return ret;
+
+ parent = path->dentry;
+ this.name = "pts";
+ this.len = 3;
+ child = d_hash_and_lookup(parent, &this);
+ if (!child)
+ return -ENOENT;
+
+ path->dentry = child;
+ dput(parent);
+ follow_mount(path);
+ return 0;
+}
+#endif
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
@@ -2582,10 +2758,11 @@ EXPORT_SYMBOL(__check_sticky);
* c. have CAP_FOWNER capability
* 6. If the victim is append-only or immutable we can't do antyhing with
* links pointing to it.
- * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
- * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
- * 9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * 7. If the victim has an unknown uid or gid we can't change the inode.
+ * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 10. We can't remove a root or mountpoint.
+ * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
@@ -2607,7 +2784,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
return -EPERM;
if (check_sticky(dir, inode) || IS_APPEND(inode) ||
- IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+ IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
@@ -2628,16 +2805,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
* 1. We can't do it if child already exists (open has special treatment for
* this case, but since we are inlined it's OK)
* 2. We can't do it if dir is read-only (done in permission())
- * 3. We should have write and exec permissions on dir
- * 4. We can't do it if dir is immutable (done in permission())
+ * 3. We can't do it if the fs can't represent the fsuid or fsgid.
+ * 4. We should have write and exec permissions on dir
+ * 5. We can't do it if dir is immutable (done in permission())
*/
static inline int may_create(struct inode *dir, struct dentry *child)
{
+ struct user_namespace *s_user_ns;
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid()))
+ return -EOVERFLOW;
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
@@ -2653,7 +2836,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
return NULL;
}
- mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+ mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
p = d_ancestor(p2, p1);
if (p) {
@@ -2680,7 +2863,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
inode_unlock(p1->d_inode);
if (p1 != p2) {
inode_unlock(p2->d_inode);
- mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+ mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
}
EXPORT_SYMBOL(unlock_rename);
@@ -2706,6 +2889,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
}
EXPORT_SYMBOL(vfs_create);
+bool may_open_dev(const struct path *path)
+{
+ return !(path->mnt->mnt_flags & MNT_NODEV) &&
+ !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+}
+
static int may_open(struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
@@ -2724,7 +2913,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
case S_IFBLK:
case S_IFCHR:
- if (path->mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(path))
return -EACCES;
/*FALLTHRU*/
case S_IFIFO:
@@ -2783,7 +2972,7 @@ static inline int open_to_namei_flags(int flag)
return flag;
}
-static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
+static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
{
int error = security_path_mknod(dir, dentry, mode, 0);
if (error)
@@ -2812,155 +3001,60 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
struct path *path, struct file *file,
const struct open_flags *op,
- bool got_write, bool need_lookup,
+ int open_flag, umode_t mode,
int *opened)
{
+ struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
struct inode *dir = nd->path.dentry->d_inode;
- unsigned open_flag = open_to_namei_flags(op->open_flag);
- umode_t mode;
int error;
- int acc_mode;
- int create_error = 0;
- struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
- bool excl;
-
- BUG_ON(dentry->d_inode);
- /* Don't create child dentry for a dead directory. */
- if (unlikely(IS_DEADDIR(dir))) {
- error = -ENOENT;
- goto out;
- }
-
- mode = op->mode;
- if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
- mode &= ~current_umask();
-
- excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
- if (excl)
+ if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */
open_flag &= ~O_TRUNC;
- /*
- * Checking write permission is tricky, bacuse we don't know if we are
- * going to actually need it: O_CREAT opens should work as long as the
- * file exists. But checking existence breaks atomicity. The trick is
- * to check access and if not granted clear O_CREAT from the flags.
- *
- * Another problem is returing the "right" error value (e.g. for an
- * O_EXCL open we want to return EEXIST not EROFS).
- */
- if (((open_flag & (O_CREAT | O_TRUNC)) ||
- (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
- if (!(open_flag & O_CREAT)) {
- /*
- * No O_CREATE -> atomicity not a requirement -> fall
- * back to lookup + open
- */
- goto no_open;
- } else if (open_flag & (O_EXCL | O_TRUNC)) {
- /* Fall back and fail with the right error */
- create_error = -EROFS;
- goto no_open;
- } else {
- /* No side effects, safe to clear O_CREAT */
- create_error = -EROFS;
- open_flag &= ~O_CREAT;
- }
- }
-
- if (open_flag & O_CREAT) {
- error = may_o_create(&nd->path, dentry, mode);
- if (error) {
- create_error = error;
- if (open_flag & O_EXCL)
- goto no_open;
- open_flag &= ~O_CREAT;
- }
- }
-
if (nd->flags & LOOKUP_DIRECTORY)
open_flag |= O_DIRECTORY;
file->f_path.dentry = DENTRY_NOT_SET;
file->f_path.mnt = nd->path.mnt;
- error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
- opened);
- if (error < 0) {
- if (create_error && error == -ENOENT)
- error = create_error;
- goto out;
- }
-
- if (error) { /* returned 1, that is */
+ error = dir->i_op->atomic_open(dir, dentry, file,
+ open_to_namei_flags(open_flag),
+ mode, opened);
+ d_lookup_done(dentry);
+ if (!error) {
+ /*
+ * We didn't have the inode before the open, so check open
+ * permission here.
+ */
+ int acc_mode = op->acc_mode;
+ if (*opened & FILE_CREATED) {
+ WARN_ON(!(open_flag & O_CREAT));
+ fsnotify_create(dir, dentry);
+ acc_mode = 0;
+ }
+ error = may_open(&file->f_path, acc_mode, open_flag);
+ if (WARN_ON(error > 0))
+ error = -EINVAL;
+ } else if (error > 0) {
if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
error = -EIO;
- goto out;
- }
- if (file->f_path.dentry) {
- dput(dentry);
- dentry = file->f_path.dentry;
- }
- if (*opened & FILE_CREATED)
- fsnotify_create(dir, dentry);
- if (!dentry->d_inode) {
- WARN_ON(*opened & FILE_CREATED);
- if (create_error) {
- error = create_error;
- goto out;
- }
} else {
- if (excl && !(*opened & FILE_CREATED)) {
- error = -EEXIST;
- goto out;
+ if (file->f_path.dentry) {
+ dput(dentry);
+ dentry = file->f_path.dentry;
+ }
+ if (*opened & FILE_CREATED)
+ fsnotify_create(dir, dentry);
+ if (unlikely(d_is_negative(dentry))) {
+ error = -ENOENT;
+ } else {
+ path->dentry = dentry;
+ path->mnt = nd->path.mnt;
+ return 1;
}
}
- goto looked_up;
- }
-
- /*
- * We didn't have the inode before the open, so check open permission
- * here.
- */
- acc_mode = op->acc_mode;
- if (*opened & FILE_CREATED) {
- WARN_ON(!(open_flag & O_CREAT));
- fsnotify_create(dir, dentry);
- acc_mode = 0;
}
- error = may_open(&file->f_path, acc_mode, open_flag);
- if (error)
- fput(file);
-
-out:
dput(dentry);
return error;
-
-no_open:
- if (need_lookup) {
- dentry = lookup_real(dir, dentry, nd->flags);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- if (create_error) {
- int open_flag = op->open_flag;
-
- error = create_error;
- if ((open_flag & O_EXCL)) {
- if (!dentry->d_inode)
- goto out;
- } else if (!dentry->d_inode) {
- goto out;
- } else if ((open_flag & O_TRUNC) &&
- d_is_reg(dentry)) {
- goto out;
- }
- /* will fail later, go on to get the right error */
- }
- }
-looked_up:
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- return 1;
}
/*
@@ -2988,62 +3082,118 @@ static int lookup_open(struct nameidata *nd, struct path *path,
{
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
+ int open_flag = op->open_flag;
struct dentry *dentry;
- int error;
- bool need_lookup = false;
+ int error, create_error = 0;
+ umode_t mode = op->mode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+ if (unlikely(IS_DEADDIR(dir_inode)))
+ return -ENOENT;
*opened &= ~FILE_CREATED;
- dentry = lookup_dcache(&nd->last, dir, nd->flags);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ dentry = d_lookup(dir, &nd->last);
+ for (;;) {
+ if (!dentry) {
+ dentry = d_alloc_parallel(dir, &nd->last, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (d_in_lookup(dentry))
+ break;
- if (!dentry) {
- dentry = d_alloc(dir, &nd->last);
- if (unlikely(!dentry))
- return -ENOMEM;
- need_lookup = true;
- } else if (dentry->d_inode) {
+ if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
+ break;
+
+ error = d_revalidate(dentry, nd->flags);
+ if (likely(error > 0))
+ break;
+ if (error)
+ goto out_dput;
+ d_invalidate(dentry);
+ dput(dentry);
+ dentry = NULL;
+ }
+ if (dentry->d_inode) {
/* Cached positive dentry: will open in f_op->open */
goto out_no_open;
}
- if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
- return atomic_open(nd, dentry, path, file, op, got_write,
- need_lookup, opened);
+ /*
+ * Checking write permission is tricky, because we don't know if we are
+ * going to actually need it: O_CREAT opens should work as long as the
+ * file exists. But checking existence breaks atomicity. The trick is
+ * to check access and if not granted clear O_CREAT from the flags.
+ *
+ * Another problem is returning the "right" error value (e.g. for an
+ * O_EXCL open we want to return EEXIST not EROFS).
+ */
+ if (open_flag & O_CREAT) {
+ if (!IS_POSIXACL(dir->d_inode))
+ mode &= ~current_umask();
+ if (unlikely(!got_write)) {
+ create_error = -EROFS;
+ open_flag &= ~O_CREAT;
+ if (open_flag & (O_EXCL | O_TRUNC))
+ goto no_open;
+ /* No side effects, safe to clear O_CREAT */
+ } else {
+ create_error = may_o_create(&nd->path, dentry, mode);
+ if (create_error) {
+ open_flag &= ~O_CREAT;
+ if (open_flag & O_EXCL)
+ goto no_open;
+ }
+ }
+ } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
+ unlikely(!got_write)) {
+ /*
+ * No O_CREAT -> atomicity not a requirement -> fall
+ * back to lookup + open
+ */
+ goto no_open;
}
- if (need_lookup) {
- BUG_ON(dentry->d_inode);
+ if (dir_inode->i_op->atomic_open) {
+ error = atomic_open(nd, dentry, path, file, op, open_flag,
+ mode, opened);
+ if (unlikely(error == -ENOENT) && create_error)
+ error = create_error;
+ return error;
+ }
- dentry = lookup_real(dir_inode, dentry, nd->flags);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+no_open:
+ if (d_in_lookup(dentry)) {
+ struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
+ nd->flags);
+ d_lookup_done(dentry);
+ if (unlikely(res)) {
+ if (IS_ERR(res)) {
+ error = PTR_ERR(res);
+ goto out_dput;
+ }
+ dput(dentry);
+ dentry = res;
+ }
}
/* Negative dentry, just create the file */
- if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
- umode_t mode = op->mode;
- if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current_umask();
- /*
- * This write is needed to ensure that a
- * rw->ro transition does not occur between
- * the time when the file is created and when
- * a permanent write count is taken through
- * the 'struct file' in finish_open().
- */
- if (!got_write) {
- error = -EROFS;
- goto out_dput;
- }
+ if (!dentry->d_inode && (open_flag & O_CREAT)) {
*opened |= FILE_CREATED;
- error = security_path_mknod(&nd->path, dentry, mode, 0);
- if (error)
+ audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+ if (!dir_inode->i_op->create) {
+ error = -EACCES;
goto out_dput;
- error = vfs_create(dir->d_inode, dentry, mode,
- nd->flags & LOOKUP_EXCL);
+ }
+ error = dir_inode->i_op->create(dir_inode, dentry, mode,
+ open_flag & O_EXCL);
if (error)
goto out_dput;
+ fsnotify_create(dir_inode, dentry);
+ }
+ if (unlikely(create_error) && !dentry->d_inode) {
+ error = create_error;
+ goto out_dput;
}
out_no_open:
path->dentry = dentry;
@@ -3069,9 +3219,7 @@ static int do_last(struct nameidata *nd,
int acc_mode = op->acc_mode;
unsigned seq;
struct inode *inode;
- struct path save_parent = { .dentry = NULL, .mnt = NULL };
struct path path;
- bool retried = false;
int error;
nd->flags &= ~LOOKUP_PARENT;
@@ -3114,8 +3262,7 @@ static int do_last(struct nameidata *nd,
return -EISDIR;
}
-retry_lookup:
- if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+ if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
error = mnt_want_write(nd->path.mnt);
if (!error)
got_write = true;
@@ -3125,9 +3272,15 @@ retry_lookup:
* dropping this one anyway.
*/
}
- inode_lock(dir->d_inode);
+ if (open_flag & O_CREAT)
+ inode_lock(dir->d_inode);
+ else
+ inode_lock_shared(dir->d_inode);
error = lookup_open(nd, &path, file, op, got_write, opened);
- inode_unlock(dir->d_inode);
+ if (open_flag & O_CREAT)
+ inode_unlock(dir->d_inode);
+ else
+ inode_unlock_shared(dir->d_inode);
if (error <= 0) {
if (error)
@@ -3160,6 +3313,10 @@ retry_lookup:
got_write = false;
}
+ error = follow_managed(&path, nd);
+ if (unlikely(error < 0))
+ return error;
+
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
@@ -3175,10 +3332,6 @@ retry_lookup:
return -EEXIST;
}
- error = follow_managed(&path, nd);
- if (unlikely(error < 0))
- return error;
-
seq = 0; /* out of RCU mode, so the value doesn't matter */
inode = d_backing_inode(path.dentry);
finish_lookup:
@@ -3189,28 +3342,15 @@ finish_lookup:
if (unlikely(error))
return error;
- if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
- path_to_nameidata(&path, nd);
- } else {
- save_parent.dentry = nd->path.dentry;
- save_parent.mnt = mntget(path.mnt);
- nd->path.dentry = path.dentry;
-
- }
+ path_to_nameidata(&path, nd);
nd->inode = inode;
nd->seq = seq;
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
error = complete_walk(nd);
- if (error) {
- path_put(&save_parent);
+ if (error)
return error;
- }
audit_inode(nd->name, nd->path.dentry, 0);
- if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
- error = -ELOOP;
- goto out;
- }
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
@@ -3227,64 +3367,30 @@ finish_open:
got_write = true;
}
finish_open_created:
- if (likely(!(open_flag & O_PATH))) {
- error = may_open(&nd->path, acc_mode, open_flag);
- if (error)
- goto out;
- }
+ error = may_open(&nd->path, acc_mode, open_flag);
+ if (error)
+ goto out;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
- if (!error) {
- *opened |= FILE_OPENED;
- } else {
- if (error == -EOPENSTALE)
- goto stale_open;
+ if (error)
goto out;
- }
+ *opened |= FILE_OPENED;
opened:
error = open_check_o_direct(file);
- if (error)
- goto exit_fput;
- error = ima_file_check(file, op->acc_mode, *opened);
- if (error)
- goto exit_fput;
-
- if (will_truncate) {
+ if (!error)
+ error = ima_file_check(file, op->acc_mode, *opened);
+ if (!error && will_truncate)
error = handle_truncate(file);
- if (error)
- goto exit_fput;
- }
out:
+ if (unlikely(error) && (*opened & FILE_OPENED))
+ fput(file);
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
}
if (got_write)
mnt_drop_write(nd->path.mnt);
- path_put(&save_parent);
return error;
-
-exit_fput:
- fput(file);
- goto out;
-
-stale_open:
- /* If no saved parent or already retried then can't retry */
- if (!save_parent.dentry || retried)
- goto out;
-
- BUG_ON(save_parent.dentry != dir);
- path_put(&nd->path);
- nd->path = save_parent;
- nd->inode = dir->d_inode;
- save_parent.mnt = NULL;
- save_parent.dentry = NULL;
- if (got_write) {
- mnt_drop_write(nd->path.mnt);
- got_write = false;
- }
- retried = true;
- goto retry_lookup;
}
static int do_tmpfile(struct nameidata *nd, unsigned flags,
@@ -3345,6 +3451,18 @@ out:
return error;
}
+static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
+{
+ struct path path;
+ int error = path_lookupat(nd, flags, &path);
+ if (!error) {
+ audit_inode(nd->name, path.dentry, 0);
+ error = vfs_open(&path, file, current_cred());
+ path_put(&path);
+ }
+ return error;
+}
+
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
@@ -3364,6 +3482,13 @@ static struct file *path_openat(struct nameidata *nd,
goto out2;
}
+ if (unlikely(file->f_flags & O_PATH)) {
+ error = do_o_path(nd, flags, file);
+ if (!error)
+ opened |= FILE_OPENED;
+ goto out2;
+ }
+
s = path_init(nd, flags);
if (IS_ERR(s)) {
put_filp(file);
@@ -3606,6 +3731,8 @@ retry:
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+ if (!error)
+ ima_post_path_mknod(dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(path.dentry->d_inode,dentry,mode,
@@ -4038,6 +4165,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
+ /*
+ * Updating the link count will likely cause i_uid and i_gid to
+ * be written back improperly if their true value is unknown to
+ * the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EPERM;
if (!dir->i_op->link)
return -EPERM;
if (S_ISDIR(inode->i_mode))
@@ -4211,7 +4345,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
- if (source == target)
+ /*
+ * Check source == target.
+ * On overlayfs need to look at underlying inodes.
+ */
+ if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
return 0;
error = may_delete(old_dir, old_dentry, is_dir);
@@ -4515,7 +4653,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link)
out:
return len;
}
-EXPORT_SYMBOL(readlink_copy);
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fb1691b43555..7bb2cda3bfef5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1562,6 +1562,7 @@ void __detach_mounts(struct dentry *dentry)
goto out_unlock;
lock_mount_hash();
+ event++;
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
@@ -2185,13 +2186,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
!(mnt_flags & MNT_NODEV)) {
- /* Was the nodev implicitly added in mount? */
- if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
- !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- mnt_flags |= MNT_NODEV;
- } else {
- return -EPERM;
- }
+ return -EPERM;
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
!(mnt_flags & MNT_NOSUID)) {
@@ -2375,7 +2370,7 @@ unlock:
return err;
}
-static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
/*
* create a new mount for userspace and request it to be added into the
@@ -2385,7 +2380,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
struct vfsmount *mnt;
int err;
@@ -2396,24 +2390,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
if (!type)
return -ENODEV;
- if (user_ns != &init_user_ns) {
- if (!(type->fs_flags & FS_USERNS_MOUNT)) {
- put_filesystem(type);
- return -EPERM;
- }
- /* Only in special cases allow devices from mounts
- * created outside the initial user namespace.
- */
- if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- flags |= MS_NODEV;
- mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
- }
- if (type->fs_flags & FS_USERNS_VISIBLE) {
- if (!fs_fully_visible(type, &mnt_flags))
- return -EPERM;
- }
- }
-
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
@@ -2423,6 +2399,11 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
if (IS_ERR(mnt))
return PTR_ERR(mnt);
+ if (mount_too_revealing(mnt, &mnt_flags)) {
+ mntput(mnt);
+ return -EPERM;
+ }
+
err = do_add_mount(real_mount(mnt), path, mnt_flags);
if (err)
mntput(mnt);
@@ -3214,22 +3195,19 @@ bool current_chrooted(void)
return chrooted;
}
-static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
+static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+ int *new_mnt_flags)
{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
int new_flags = *new_mnt_flags;
struct mount *mnt;
bool visible = false;
- if (unlikely(!ns))
- return false;
-
down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != type)
+ if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
@@ -3238,12 +3216,12 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
continue;
- /* Read the mount flags and filter out flags that
- * may safely be ignored.
- */
+ /* A local view of the mount flags */
mnt_flags = mnt->mnt.mnt_flags;
- if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
- mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
+
+ /* Don't miss readonly hidden in the superblock flags */
+ if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
+ mnt_flags |= MNT_LOCK_READONLY;
/* Verify the mount flags are equal to or more permissive
* than the proposed new mount.
@@ -3251,15 +3229,6 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
if ((mnt_flags & MNT_LOCK_READONLY) &&
!(new_flags & MNT_READONLY))
continue;
- if ((mnt_flags & MNT_LOCK_NODEV) &&
- !(new_flags & MNT_NODEV))
- continue;
- if ((mnt_flags & MNT_LOCK_NOSUID) &&
- !(new_flags & MNT_NOSUID))
- continue;
- if ((mnt_flags & MNT_LOCK_NOEXEC) &&
- !(new_flags & MNT_NOEXEC))
- continue;
if ((mnt_flags & MNT_LOCK_ATIME) &&
((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
continue;
@@ -3271,7 +3240,7 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
struct inode *inode = child->mnt_mountpoint->d_inode;
/* Only worry about locked mounts */
- if (!(mnt_flags & MNT_LOCKED))
+ if (!(child->mnt.mnt_flags & MNT_LOCKED))
continue;
/* Is the directory permanetly empty? */
if (!is_empty_dir_inode(inode))
@@ -3279,9 +3248,6 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
}
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
- MNT_LOCK_NODEV | \
- MNT_LOCK_NOSUID | \
- MNT_LOCK_NOEXEC | \
MNT_LOCK_ATIME);
visible = true;
goto found;
@@ -3292,6 +3258,42 @@ found:
return visible;
}
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+{
+ const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ unsigned long s_iflags;
+
+ if (ns->user_ns == &init_user_ns)
+ return false;
+
+ /* Can this filesystem be too revealing? */
+ s_iflags = mnt->mnt_sb->s_iflags;
+ if (!(s_iflags & SB_I_USERNS_VISIBLE))
+ return false;
+
+ if ((s_iflags & required_iflags) != required_iflags) {
+ WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
+ required_iflags);
+ return true;
+ }
+
+ return !mnt_already_visible(ns, mnt, new_mnt_flags);
+}
+
+bool mnt_may_suid(struct vfsmount *mnt)
+{
+ /*
+ * Foreign mounts (accessed via fchdir or through /proc
+ * symlinks) are always treated as if they are nosuid. This
+ * prevents namespaces from trusting potentially unsafe
+ * suid/sgid bits, file caps, or security labels that originate
+ * in other namespaces.
+ */
+ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
+ current_in_userns(mnt->mnt_sb->s_user_ns);
+}
+
static struct ns_common *mntns_get(struct task_struct *task)
{
struct ns_common *ns = NULL;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b7f8eaeea5d83..17de5c13dfaed 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -74,7 +74,7 @@ const struct inode_operations ncp_dir_inode_operations =
*/
static int ncp_lookup_validate(struct dentry *, unsigned int);
static int ncp_hash_dentry(const struct dentry *, struct qstr *);
-static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
+static int ncp_compare_dentry(const struct dentry *,
unsigned int, const char *, const struct qstr *);
static int ncp_delete_dentry(const struct dentry *);
static void ncp_d_prune(struct dentry *dentry);
@@ -139,7 +139,7 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
int i;
t = NCP_IO_TABLE(sb);
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (i=0; i<this->len ; i++)
hash = partial_name_hash(ncp_tolower(t, this->name[i]),
hash);
@@ -154,7 +154,7 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
* the callers will handle races.
*/
static int
-ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
+ncp_compare_dentry(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct inode *pinode;
@@ -162,7 +162,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
if (len != name->len)
return 1;
- pinode = d_inode_rcu(parent);
+ pinode = d_inode_rcu(dentry->d_parent);
if (!pinode)
return 1;
@@ -510,7 +510,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
kunmap(ctl.page);
SetPageUptodate(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
ctl.page = NULL;
}
ctl.idx = 0;
@@ -520,7 +520,7 @@ invalid_cache:
if (ctl.page) {
kunmap(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
ctl.page = NULL;
}
ctl.cache = cache;
@@ -554,14 +554,14 @@ finished:
kunmap(ctl.page);
SetPageUptodate(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
}
if (page) {
cache->head = ctl.head;
kunmap(page);
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
out:
return result;
@@ -649,7 +649,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
kunmap(ctl.page);
SetPageUptodate(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
}
ctl.cache = NULL;
ctl.idx -= NCP_DIRCACHE_SIZE;
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 5233fbc1747a5..17cfb743b5bf0 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -191,7 +191,7 @@ struct ncp_cache_head {
int eof;
};
-#define NCP_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *)))
+#define NCP_DIRCACHE_SIZE ((int)(PAGE_SIZE/sizeof(struct dentry *)))
union ncp_dir_cache {
struct ncp_cache_head head;
struct dentry *dentry[NCP_DIRCACHE_SIZE];
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8664417955a27..6abdda209642e 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
CFLAGS_nfstrace.o += -I$(src)
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
- direct.o pagelist.o read.o symlink.o unlink.o \
+ io.o direct.o pagelist.o read.o symlink.o unlink.o \
write.o namespace.o mount_clnt.o nfstrace.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 02e4d87d2ed31..f55a4e7560470 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -102,14 +102,15 @@ static inline void put_parallel(struct parallel_io *p)
}
static struct bio *
-bl_submit_bio(int rw, struct bio *bio)
+bl_submit_bio(struct bio *bio)
{
if (bio) {
get_parallel(bio->bi_private);
dprintk("%s submitting %s bio %u@%llu\n", __func__,
- rw == READ ? "read" : "write", bio->bi_iter.bi_size,
+ bio_op(bio) == READ ? "read" : "write",
+ bio->bi_iter.bi_size,
(unsigned long long)bio->bi_iter.bi_sector);
- submit_bio(rw, bio);
+ submit_bio(bio);
}
return NULL;
}
@@ -158,7 +159,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
if (disk_addr < map->start || disk_addr >= map->start + map->len) {
if (!dev->map(dev, disk_addr, map))
return ERR_PTR(-EIO);
- bio = bl_submit_bio(rw, bio);
+ bio = bl_submit_bio(bio);
}
disk_addr += map->disk_offset;
disk_addr -= map->start;
@@ -174,9 +175,10 @@ retry:
disk_addr >> SECTOR_SHIFT, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
+ bio_set_op_attrs(bio, rw, 0);
}
if (bio_add_page(bio, page, *len, offset) < *len) {
- bio = bl_submit_bio(rw, bio);
+ bio = bl_submit_bio(bio);
goto retry;
}
return bio;
@@ -231,7 +233,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
size_t bytes_left = header->args.count;
unsigned int pg_offset = header->args.pgbase, pg_len;
struct page **pages = header->args.pages;
- int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
const bool is_dio = (header->dreq != NULL);
struct blk_plug plug;
int i;
@@ -252,7 +254,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
for (i = pg_index; i < header->page_array.npages; i++) {
if (extent_length <= 0) {
/* We've used up the previous extent */
- bio = bl_submit_bio(READ, bio);
+ bio = bl_submit_bio(bio);
/* Get the next one */
if (!ext_tree_lookup(bl, isect, &be, false)) {
@@ -263,17 +265,17 @@ bl_read_pagelist(struct nfs_pgio_header *header)
}
if (is_dio) {
- if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
- pg_len = PAGE_CACHE_SIZE - pg_offset;
+ if (pg_offset + bytes_left > PAGE_SIZE)
+ pg_len = PAGE_SIZE - pg_offset;
else
pg_len = bytes_left;
} else {
BUG_ON(pg_offset != 0);
- pg_len = PAGE_CACHE_SIZE;
+ pg_len = PAGE_SIZE;
}
if (is_hole(&be)) {
- bio = bl_submit_bio(READ, bio);
+ bio = bl_submit_bio(bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
zero_user_segment(pages[i], pg_offset, pg_len);
@@ -306,7 +308,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
}
out:
- bl_submit_bio(READ, bio);
+ bl_submit_bio(bio);
blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
@@ -339,9 +341,9 @@ static void bl_write_cleanup(struct work_struct *work)
if (likely(!hdr->pnfs_error)) {
struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
- u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+ u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
u64 end = (hdr->args.offset + hdr->args.count +
- PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+ PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
(end - start) >> SECTOR_SHIFT);
@@ -373,7 +375,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
loff_t offset = header->args.offset;
size_t count = header->args.count;
struct page **pages = header->args.pages;
- int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
unsigned int pg_len;
struct blk_plug plug;
int i;
@@ -392,13 +394,13 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
blk_start_plug(&plug);
/* we always write out the whole page */
- offset = offset & (loff_t)PAGE_CACHE_MASK;
+ offset = offset & (loff_t)PAGE_MASK;
isect = offset >> SECTOR_SHIFT;
for (i = pg_index; i < header->page_array.npages; i++) {
if (extent_length <= 0) {
/* We've used up the previous extent */
- bio = bl_submit_bio(WRITE, bio);
+ bio = bl_submit_bio(bio);
/* Get the next one */
if (!ext_tree_lookup(bl, isect, &be, true)) {
header->pnfs_error = -EINVAL;
@@ -408,7 +410,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
extent_length = be.be_length - (isect - be.be_f_offset);
}
- pg_len = PAGE_CACHE_SIZE;
+ pg_len = PAGE_SIZE;
bio = do_add_page_to_bio(bio, header->page_array.npages - i,
WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
@@ -427,7 +429,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
header->res.count = header->args.count;
out:
- bl_submit_bio(WRITE, bio);
+ bl_submit_bio(bio);
blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
@@ -820,7 +822,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
pgoff_t end;
/* Optimize common case that writes from 0 to end of file */
- end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+ end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (end != inode->i_mapping->nrpages) {
rcu_read_lock();
end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
@@ -828,9 +830,9 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
}
if (!end)
- return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
+ return i_size_read(inode) - (idx << PAGE_SHIFT);
else
- return (end - idx) << PAGE_CACHE_SHIFT;
+ return (end - idx) << PAGE_SHIFT;
}
static void
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index bc21205309e08..18e6fd0b9506e 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -40,8 +40,8 @@
#include "../pnfs.h"
#include "../netns.h"
-#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
-#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define PAGE_CACHE_SECTORS (PAGE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
struct pnfs_block_dev;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e5b89675263ef..a69ef4e9c24c7 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
if (!p)
return -EIO;
b->simple.nr_sigs = be32_to_cpup(p++);
- if (!b->simple.nr_sigs) {
- dprintk("no signature\n");
+ if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
+ dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
return -EIO;
}
@@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
memcpy(&b->simple.sigs[i].sig, p,
b->simple.sigs[i].sig_len);
- b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+ b->simple.len += 8 + 4 + \
+ (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
}
break;
case PNFS_BLOCK_VOLUME_SLICE:
@@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
+
b->concat.volumes_count = be32_to_cpup(p++);
+ if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->concat.volumes_count);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
if (!p)
@@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 8 + 4);
if (!p)
return -EIO;
+
p = xdr_decode_hyper(p, &b->stripe.chunk_size);
b->stripe.volumes_count = be32_to_cpup(p++);
+ if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
if (!p)
@@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
- if (IS_ERR(d->bdev)) {
+ bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
- return PTR_ERR(d->bdev);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+ return PTR_ERR(bdev);
}
+ d->bdev = bdev;
d->len = i_size_read(d->bdev->bd_inode);
@@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
+/*
+ * Try to open the udev path for the WWN. At least on Debian the udev
+ * by-id path will always point to the dm-multipath device if one exists.
+ */
+static struct block_device *
+bl_open_udev_path(struct pnfs_block_volume *v)
+{
+ struct block_device *bdev;
+ const char *devname;
+
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
+ v->scsi.designator_len, v->scsi.designator);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(bdev)) {
+ pr_warn("pNFS: failed to open device %s (%ld)\n",
+ devname, PTR_ERR(bdev));
+ }
+
+ kfree(devname);
+ return bdev;
+}
+
+/*
+ * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
+ * wwn- links will only point to the first discovered SCSI device there.
+ */
+static struct block_device *
+bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
+{
+ struct block_device *bdev;
+ const char *devname;
+
+ devname = kasprintf(GFP_KERNEL,
+ "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
+ v->scsi.designator_type,
+ v->scsi.designator_len, v->scsi.designator);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+ kfree(devname);
+ return bdev;
+}
+
static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
const struct pr_ops *ops;
- const char *devname;
int error;
if (!bl_validate_designator(v))
return -EINVAL;
- switch (v->scsi.designator_len) {
- case 8:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
- v->scsi.designator);
- break;
- case 12:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
- v->scsi.designator);
- break;
- case 16:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
- v->scsi.designator);
- break;
- default:
- return -EINVAL;
- }
-
- d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
- if (IS_ERR(d->bdev)) {
- pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(d->bdev));
- kfree(devname);
- return PTR_ERR(d->bdev);
- }
-
- kfree(devname);
+ bdev = bl_open_dm_mpath_udev_path(v);
+ if (IS_ERR(bdev))
+ bdev = bl_open_udev_path(v);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ d->bdev = bdev;
d->len = i_size_read(d->bdev->bd_inode);
d->map = bl_map_simple;
@@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- blkdev_put(d->bdev, FMODE_READ);
+ blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
return error;
}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 720b3ff55fa9b..992bcb19c11e7 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
return be;
}
+static void __ext_put_deviceids(struct list_head *head)
+{
+ struct pnfs_block_extent *be, *tmp;
+
+ list_for_each_entry_safe(be, tmp, head, be_list) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
+}
+
static void
__ext_tree_insert(struct rb_root *root,
struct pnfs_block_extent *new, bool merge_ok)
@@ -163,7 +173,8 @@ free_new:
}
static int
-__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+__ext_tree_remove(struct rb_root *root,
+ sector_t start, sector_t end, struct list_head *tmp)
{
struct pnfs_block_extent *be;
sector_t len1 = 0, len2 = 0;
@@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
struct pnfs_block_extent *next = ext_tree_next(be);
rb_erase(&be->be_node, root);
- nfs4_put_deviceid_node(be->be_device);
- kfree(be);
+ list_add_tail(&be->be_list, tmp);
be = next;
}
@@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
sector_t start, sector_t end)
{
int err, err2;
+ LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock);
- err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (rw) {
- err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+ err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
if (!err)
err = err2;
}
spin_unlock(&bl->bl_ext_lock);
+ __ext_put_deviceids(&tmp);
return err;
}
@@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
sector_t end = start + len;
struct pnfs_block_extent *be;
int err = 0;
+ LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock);
/*
* First remove all COW extents or holes from written to range.
*/
- err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (err)
goto out;
@@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
}
out:
spin_unlock(&bl->bl_ext_lock);
+
+ __ext_put_deviceids(&tmp);
return err;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 618ced381a140..c92a75e066a6f 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -119,27 +119,30 @@ out:
* hashed by filehandle.
*/
static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
- struct nfs_fh *fh, nfs4_stateid *stateid)
+ struct nfs_fh *fh)
{
struct nfs_server *server;
+ struct nfs_inode *nfsi;
struct inode *ino;
struct pnfs_layout_hdr *lo;
+restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) {
- if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+ nfsi = NFS_I(lo->plh_inode);
+ if (nfs_compare_fh(fh, &nfsi->fh))
continue;
- if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
+ if (nfsi->layout != lo)
continue;
ino = igrab(lo->plh_inode);
if (!ino)
break;
spin_lock(&ino->i_lock);
/* Is this layout in the process of being freed? */
- if (NFS_I(ino)->layout != lo) {
+ if (nfsi->layout != lo) {
spin_unlock(&ino->i_lock);
iput(ino);
- break;
+ goto restart;
}
pnfs_get_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
}
static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
- struct nfs_fh *fh, nfs4_stateid *stateid)
+ struct nfs_fh *fh)
{
struct pnfs_layout_hdr *lo;
spin_lock(&clp->cl_lock);
rcu_read_lock();
- lo = get_layout_by_fh_locked(clp, fh, stateid);
+ lo = get_layout_by_fh_locked(clp, fh);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
@@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
/*
* Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
*/
-static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
u32 oldseq, newseq;
- oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ /* Is the stateid still not initialised? */
+ if (!pnfs_layout_is_valid(lo))
+ return NFS4ERR_DELAY;
+
+ /* Mismatched stateid? */
+ if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
+ return NFS4ERR_BAD_STATEID;
+
newseq = be32_to_cpu(new->seqid);
+ /* Are we already in a layout recall situation? */
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ lo->plh_return_seq != 0) {
+ if (newseq < lo->plh_return_seq)
+ return NFS4ERR_OLD_STATEID;
+ if (newseq > lo->plh_return_seq)
+ return NFS4ERR_DELAY;
+ goto out;
+ }
+ /* Check that the stateid matches what we think it should be. */
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
if (newseq > oldseq + 1)
- return false;
- return true;
+ return NFS4ERR_DELAY;
+ /* Crazy server! */
+ if (newseq <= oldseq)
+ return NFS4ERR_OLD_STATEID;
+out:
+ return NFS_OK;
}
static u32 initiate_file_draining(struct nfs_client *clp,
@@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list);
- lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
+ lo = get_layout_by_fh(clp, &args->cbl_fh);
if (!lo) {
trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
&args->cbl_stateid, -rv);
@@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
}
ino = lo->plh_inode;
+ pnfs_layoutcommit_inode(ino, false);
+
spin_lock(&ino->i_lock);
- if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
- rv = NFS4ERR_DELAY;
+ rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+ if (rv != NFS_OK)
goto unlock;
- }
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
- spin_unlock(&ino->i_lock);
- pnfs_layoutcommit_inode(ino, false);
-
- spin_lock(&ino->i_lock);
/*
* Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
*/
@@ -217,16 +239,19 @@ static u32 initiate_file_draining(struct nfs_client *clp,
}
if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
- &args->cbl_range)) {
+ &args->cbl_range,
+ be32_to_cpu(args->cbl_stateid.seqid))) {
rv = NFS4_OK;
goto unlock;
}
+ /* Embrace your forgetfulness! */
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range);
}
- pnfs_mark_layout_returned_if_empty(lo);
unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
@@ -500,8 +525,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
cps->slot = slot;
/* The ca_maxresponsesize_cached is 0 with no DRC */
- if (args->csa_cachethis != 0)
- return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ if (args->csa_cachethis != 0) {
+ status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ goto out_unlock;
+ }
/*
* Check for pending referring calls. If a match is found, a
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 976c90608e561..656f68f7fe53e 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
p = read_buf(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(stateid, p, NFS4_STATEID_SIZE);
+ memcpy(stateid->data, p, NFS4_STATEID_SIZE);
return 0;
}
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
{
__be32 *p;
@@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
__be32 *p;
__be32 status;
- status = decode_stateid(xdr, &args->stateid);
+ status = decode_delegation_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
goto out;
p = read_buf(xdr, 4);
@@ -227,6 +233,11 @@ out:
}
#if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
@@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
}
p = xdr_decode_hyper(p, &args->cbl_range.offset);
p = xdr_decode_hyper(p, &args->cbl_range.length);
- status = decode_stateid(xdr, &args->cbl_stateid);
+ status = decode_layout_stateid(xdr, &args->cbl_stateid);
if (unlikely(status != 0))
goto out;
} else if (args->cbl_recall_type == RETURN_FSID) {
@@ -914,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
if (hdr_arg.minorversion == 0) {
cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
- return rpc_drop_reply;
+ goto out_invalidcred;
}
cps.minorversion = hdr_arg.minorversion;
@@ -942,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
nfs_put_client(cps.clp);
dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
+
+out_invalidcred:
+ pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
+ return rpc_autherr_badcred;
}
/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d6d5d2a48e838..003ebce4bbc49 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
*/
struct nfs_client *
nfs_get_client(const struct nfs_client_initdata *cl_init,
- const struct rpc_timeout *timeparms,
- const char *ip_addr,
rpc_authflavor_t authflavour)
{
struct nfs_client *clp, *new = NULL;
@@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
new->cl_flags = cl_init->init_flags;
- return rpc_ops->init_client(new, timeparms, ip_addr);
+ return rpc_ops->init_client(new, cl_init);
}
spin_unlock(&nn->nfs_client_lock);
@@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
* Create an RPC client handle
*/
int nfs_create_rpc_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
+ const struct nfs_client_initdata *cl_init,
rpc_authflavor_t flavor)
{
struct rpc_clnt *clnt = NULL;
@@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp,
.protocol = clp->cl_proto,
.address = (struct sockaddr *)&clp->cl_addr,
.addrsize = clp->cl_addrlen,
- .timeout = timeparms,
+ .timeout = cl_init->timeparms,
.servername = clp->cl_hostname,
+ .nodename = cl_init->nodename,
.program = &nfs_program,
.version = clp->rpc_ops->version,
.authflavor = flavor,
@@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
* nfs_init_client - Initialise an NFS2 or NFS3 client
*
* @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: IP presentation address (not used)
+ * @cl_init: Initialisation parameters
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr)
+ const struct nfs_client_initdata *cl_init)
{
int error;
@@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
* Create a client RPC handle for doing FSSTAT with UNIX auth only
* - RFC 2623, sec 2.3.2
*/
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0)
goto error;
nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server,
const struct nfs_parsed_mount_data *data,
struct nfs_subversion *nfs_mod)
{
+ struct rpc_timeout timeparms;
struct nfs_client_initdata cl_init = {
.hostname = data->nfs_server.hostname,
.addr = (const struct sockaddr *)&data->nfs_server.address,
@@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server,
.nfs_mod = nfs_mod,
.proto = data->nfs_server.protocol,
.net = data->net,
+ .timeparms = &timeparms,
};
- struct rpc_timeout timeparms;
struct nfs_client *clp;
int error;
@@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server,
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
+ clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp);
@@ -736,7 +734,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->rsize = max_rpc_payload;
if (server->rsize > NFS_MAX_FILE_IO_SIZE)
server->rsize = NFS_MAX_FILE_IO_SIZE;
- server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
server->backing_dev_info.name = "nfs";
server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
@@ -745,13 +743,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
- server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
- if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
- server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
+ if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES)
+ server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES;
if (server->dtsize > server->rsize)
server->dtsize = server->rsize;
@@ -1102,7 +1100,6 @@ static const struct file_operations nfs_server_list_fops = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
- .owner = THIS_MODULE,
};
static int nfs_volume_list_open(struct inode *inode, struct file *file);
@@ -1123,7 +1120,6 @@ static const struct file_operations nfs_volume_list_fops = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
- .owner = THIS_MODULE,
};
/*
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5166adcfc0fb2..322c2585bc341 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp)
/**
* nfs4_copy_delegation_stateid - Copy inode's state ID information
- * @dst: stateid data structure to fill in
* @inode: inode to check
* @flags: delegation type requirement
+ * @dst: stateid data structure to fill in
+ * @cred: optional argument to retrieve credential
*
* Returns "true" and fills in "dst->data" * if inode had a delegation,
* otherwise "false" is returned.
*/
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
- fmode_t flags)
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
+ nfs4_stateid *dst, struct rpc_cred **cred)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
@@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
if (ret) {
nfs4_stateid_copy(dst, &delegation->stateid);
nfs_mark_delegation_referenced(delegation);
+ if (cred)
+ *cred = get_rpccred(delegation->cred);
}
rcu_read_unlock();
return ret;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 333063e032f01..64724d252a797 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4bfa7d8bcadee..177fefb26c18d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
.read = generic_read_dir,
- .iterate = nfs_readdir,
+ .iterate_shared = nfs_readdir,
.open = nfs_opendir,
.release = nfs_closedir,
.fsync = nfs_fsync_dir,
@@ -145,6 +145,7 @@ struct nfs_cache_array_entry {
};
struct nfs_cache_array {
+ atomic_t refcount;
int size;
int eof_index;
u64 last_cookie;
@@ -200,11 +201,20 @@ void nfs_readdir_clear_array(struct page *page)
int i;
array = kmap_atomic(page);
- for (i = 0; i < array->size; i++)
- kfree(array->array[i].string.name);
+ if (atomic_dec_and_test(&array->refcount))
+ for (i = 0; i < array->size; i++)
+ kfree(array->array[i].string.name);
kunmap_atomic(array);
}
+static bool grab_page(struct page *page)
+{
+ struct nfs_cache_array *array = kmap_atomic(page);
+ bool res = atomic_inc_not_zero(&array->refcount);
+ kunmap_atomic(array);
+ return res;
+}
+
/*
* the caller is responsible for freeing qstr.name
* when called by nfs_readdir_add_to_array, the strings will be freed in
@@ -222,7 +232,7 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
* in a page cache page which kmemleak does not scan.
*/
kmemleak_not_leak(string->name);
- string->hash = full_name_hash(name, len);
+ string->hash = full_name_hash(NULL, name, len);
return 0;
}
@@ -377,7 +387,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
again:
timestamp = jiffies;
gencount = nfs_inc_attr_generation_counter();
- error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
+ error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
/* We requested READDIRPLUS, but the server doesn't grok it */
@@ -414,12 +424,17 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc,
static
int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
{
+ struct inode *inode;
struct nfs_inode *nfsi;
if (d_really_is_negative(dentry))
return 0;
- nfsi = NFS_I(d_inode(dentry));
+ inode = d_inode(dentry);
+ if (is_bad_inode(inode) || NFS_STALE(inode))
+ return 0;
+
+ nfsi = NFS_I(inode);
if (entry->fattr->fileid == nfsi->fileid)
return 1;
if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
@@ -470,6 +485,7 @@ static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
{
struct qstr filename = QSTR_INIT(entry->name, entry->len);
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
struct dentry *dentry;
struct dentry *alias;
struct inode *dir = d_inode(parent);
@@ -486,10 +502,16 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
if (filename.len == 2 && filename.name[1] == '.')
return;
}
- filename.hash = full_name_hash(filename.name, filename.len);
+ filename.hash = full_name_hash(parent, filename.name, filename.len);
dentry = d_lookup(parent, &filename);
- if (dentry != NULL) {
+again:
+ if (!dentry) {
+ dentry = d_alloc_parallel(parent, &filename, &wq);
+ if (IS_ERR(dentry))
+ return;
+ }
+ if (!d_in_lookup(dentry)) {
/* Is there a mountpoint here? If so, just exit */
if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
&entry->fattr->fsid))
@@ -503,26 +525,21 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
} else {
d_invalidate(dentry);
dput(dentry);
+ dentry = NULL;
+ goto again;
}
}
- dentry = d_alloc(parent, &filename);
- if (dentry == NULL)
- return;
-
inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
- if (IS_ERR(inode))
- goto out;
-
alias = d_splice_alias(inode, dentry);
- if (IS_ERR(alias))
- goto out;
- else if (alias) {
- nfs_set_verifier(alias, nfs_save_change_attribute(dir));
- dput(alias);
- } else
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-
+ d_lookup_done(dentry);
+ if (alias) {
+ if (IS_ERR(alias))
+ goto out;
+ dput(dentry);
+ dentry = alias;
+ }
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out:
dput(dentry);
}
@@ -560,7 +577,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
count++;
if (desc->plus != 0)
- nfs_prime_dcache(desc->file->f_path.dentry, entry);
+ nfs_prime_dcache(file_dentry(desc->file), entry);
status = nfs_readdir_add_to_array(entry, page);
if (status != 0)
@@ -643,6 +660,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
goto out_label_free;
}
memset(array, 0, sizeof(struct nfs_cache_array));
+ atomic_set(&array->refcount, 1);
array->eof_index = -1;
status = nfs_readdir_alloc_pages(pages, array_size);
@@ -705,17 +723,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
- if (!desc->page->mapping)
- nfs_readdir_clear_array(desc->page);
- page_cache_release(desc->page);
+ nfs_readdir_clear_array(desc->page);
+ put_page(desc->page);
desc->page = NULL;
}
static
struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
{
- return read_cache_page(file_inode(desc->file)->i_mapping,
+ struct page *page;
+
+ for (;;) {
+ page = read_cache_page(desc->file->f_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+ if (IS_ERR(page) || grab_page(page))
+ break;
+ put_page(page);
+ }
+ return page;
}
/*
@@ -864,7 +889,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
*/
static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = file->f_path.dentry;
+ struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
@@ -889,7 +914,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
- nfs_block_sillyrename(dentry);
if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -925,7 +949,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
} while (!desc->eof);
out:
- nfs_unblock_sillyrename(dentry);
if (res > 0)
res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
@@ -934,13 +957,11 @@ out:
static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
{
- struct inode *inode = file_inode(filp);
struct nfs_open_dir_context *dir_ctx = filp->private_data;
dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
filp, offset, whence);
- inode_lock(inode);
switch (whence) {
case 1:
offset += filp->f_pos;
@@ -948,16 +969,13 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- offset = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
dir_ctx->dir_cookie = 0;
dir_ctx->duped = 0;
}
-out:
- inode_unlock(inode);
return offset;
}
@@ -1350,7 +1368,6 @@ EXPORT_SYMBOL_GPL(nfs_dentry_operations);
struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
struct dentry *res;
- struct dentry *parent;
struct inode *inode = NULL;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
@@ -1380,21 +1397,18 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (IS_ERR(label))
goto out;
- parent = dentry->d_parent;
- /* Protect against concurrent sillydeletes */
trace_nfs_lookup_enter(dir, dentry, flags);
- nfs_block_sillyrename(parent);
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
- goto out_unblock_sillyrename;
+ goto out_label;
}
inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
res = ERR_CAST(inode);
if (IS_ERR(res))
- goto out_unblock_sillyrename;
+ goto out_label;
/* Success: notify readdir to use READDIRPLUS */
nfs_advise_use_readdirplus(dir);
@@ -1403,12 +1417,11 @@ no_entry:
res = d_splice_alias(inode, dentry);
if (res != NULL) {
if (IS_ERR(res))
- goto out_unblock_sillyrename;
+ goto out_label;
dentry = res;
}
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-out_unblock_sillyrename:
- nfs_unblock_sillyrename(parent);
+out_label:
trace_nfs_lookup_exit(dir, dentry, flags, error);
nfs4_label_free(label);
out:
@@ -1471,11 +1484,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned open_flags,
umode_t mode, int *opened)
{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
struct nfs_open_context *ctx;
struct dentry *res;
struct iattr attr = { .ia_valid = ATTR_OPEN };
struct inode *inode;
unsigned int lookup_flags = 0;
+ bool switched = false;
int err;
/* Expect a negative dentry */
@@ -1490,7 +1505,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
/* NFS only supports OPEN on regular files */
if ((open_flags & O_DIRECTORY)) {
- if (!d_unhashed(dentry)) {
+ if (!d_in_lookup(dentry)) {
/*
* Hashed negative dentry with O_DIRECTORY: dentry was
* revalidated and is fine, no need to perform lookup
@@ -1514,22 +1529,31 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
attr.ia_size = 0;
}
+ if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) {
+ d_drop(dentry);
+ switched = true;
+ dentry = d_alloc_parallel(dentry->d_parent,
+ &dentry->d_name, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ if (unlikely(!d_in_lookup(dentry)))
+ return finish_no_open(file, dentry);
+ }
+
ctx = create_nfs_open_context(dentry, open_flags);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
trace_nfs_atomic_open_enter(dir, ctx, open_flags);
- nfs_block_sillyrename(dentry->d_parent);
inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);
- nfs_unblock_sillyrename(dentry->d_parent);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
put_nfs_open_context(ctx);
+ d_drop(dentry);
switch (err) {
case -ENOENT:
- d_drop(dentry);
d_add(dentry, NULL);
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
break;
@@ -1551,14 +1575,23 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
put_nfs_open_context(ctx);
out:
+ if (unlikely(switched)) {
+ d_lookup_done(dentry);
+ dput(dentry);
+ }
return err;
no_open:
res = nfs_lookup(dir, dentry, lookup_flags);
- err = PTR_ERR(res);
+ if (switched) {
+ d_lookup_done(dentry);
+ if (!res)
+ res = dentry;
+ else
+ dput(dentry);
+ }
if (IS_ERR(res))
- goto out;
-
+ return PTR_ERR(res);
return finish_no_open(file, res);
}
EXPORT_SYMBOL_GPL(nfs_atomic_open);
@@ -1766,7 +1799,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
trace_nfs_rmdir_enter(dir, dentry);
if (d_really_is_positive(dentry)) {
- nfs_wait_on_sillyrename(dentry);
+ down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
/* Ensure the VFS deletes this inode */
switch (error) {
@@ -1776,6 +1809,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
case -ENOENT:
nfs_dentry_handle_enoent(dentry);
}
+ up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
} else
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
trace_nfs_rmdir_exit(dir, dentry, error);
@@ -1923,7 +1957,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
* add_to_page_cache_lru() grabs an extra page refcount.
* Drop it here to avoid leaking this page later.
*/
- page_cache_release(page);
+ put_page(page);
} else
__free_page(page);
@@ -2218,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
return NULL;
}
-static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache;
- int err = -ENOENT;
+ bool retry = true;
+ int err;
spin_lock(&inode->i_lock);
- if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
- goto out_zap;
- cache = nfs_access_search_rbtree(inode, cred);
- if (cache == NULL)
- goto out;
- if (!nfs_have_delegated_attributes(inode) &&
- !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
- goto out_stale;
+ for(;;) {
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out_zap;
+ cache = nfs_access_search_rbtree(inode, cred);
+ err = -ENOENT;
+ if (cache == NULL)
+ goto out;
+ /* Found an entry, is our attribute cache valid? */
+ if (!nfs_attribute_cache_expired(inode) &&
+ !(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
+ break;
+ err = -ECHILD;
+ if (!may_block)
+ goto out;
+ if (!retry)
+ goto out_zap;
+ spin_unlock(&inode->i_lock);
+ err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (err)
+ return err;
+ spin_lock(&inode->i_lock);
+ retry = false;
+ }
res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
@@ -2241,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
out:
spin_unlock(&inode->i_lock);
return err;
-out_stale:
- rb_erase(&cache->rb_node, &nfsi->access_cache);
- list_del(&cache->lru);
- spin_unlock(&inode->i_lock);
- nfs_access_free_entry(cache);
- return -ENOENT;
out_zap:
spin_unlock(&inode->i_lock);
nfs_access_zap_cache(inode);
@@ -2273,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
cache = NULL;
if (cache == NULL)
goto out;
- if (!nfs_have_delegated_attributes(inode) &&
- !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
+ if (err)
goto out;
res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
- err = 0;
out:
rcu_read_unlock();
return err;
@@ -2368,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask);
static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
{
struct nfs_access_entry cache;
+ bool may_block = (mask & MAY_NOT_BLOCK) == 0;
int status;
trace_nfs_access_enter(inode);
status = nfs_access_get_cached_rcu(inode, cred, &cache);
if (status != 0)
- status = nfs_access_get_cached(inode, cred, &cache);
+ status = nfs_access_get_cached(inode, cred, &cache, may_block);
if (status == 0)
goto out_cached;
status = -ECHILD;
- if (mask & MAY_NOT_BLOCK)
+ if (!may_block)
goto out;
/* Be clever: ask server to check for all possible rights */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 7a0cfd3266e56..72b7d13ee3c6a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -87,6 +87,7 @@ struct nfs_direct_req {
int mirror_count;
ssize_t count, /* bytes actually processed */
+ max_count, /* max expected count */
bytes_left, /* bytes left to be sent */
io_start, /* start of IO */
error; /* any reported error */
@@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
int i;
ssize_t count;
+ WARN_ON_ONCE(dreq->count >= dreq->max_count);
+
if (dreq->mirror_count == 1) {
dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
dreq->count += hdr->good_bytes;
@@ -193,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
WARN_ON_ONCE(verfp->committed < 0);
}
+static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
+ const struct nfs_writeverf *v2)
+{
+ return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
+}
+
/*
* nfs_direct_cmp_hdr_verf - compare verifier for pgio header
* @dreq - direct request possibly spanning multiple servers
@@ -212,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
nfs_direct_set_hdr_verf(dreq, hdr);
return 0;
}
- return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+ return nfs_direct_cmp_verf(verfp, &hdr->verf);
}
/*
@@ -235,22 +244,20 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
if (verfp->committed < 0)
return 1;
- return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+ return nfs_direct_cmp_verf(verfp, &data->verf);
}
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @iocb: target I/O control block
- * @iov: array of vectors that define I/O buffer
- * @pos: offset in file to begin the operation
- * @nr_segs: size of iovec array
+ * @iter: I/O buffer
*
* The presence of this routine in the address space ops vector means
* the NFS client supports direct I/O. However, for most direct IO, we
* shunt off direct read and write requests before the VFS gets them,
* so this method is only ever called for swap.
*/
-ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -261,7 +268,7 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter, pos);
+ return nfs_file_direct_read(iocb, iter);
return nfs_file_direct_write(iocb, iter);
}
@@ -269,13 +276,13 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++)
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq)
{
- cinfo->lock = &dreq->inode->i_lock;
+ cinfo->inode = dreq->inode;
cinfo->mds = &dreq->mds_cinfo;
cinfo->ds = &dreq->ds_cinfo;
cinfo->dreq = dreq;
@@ -350,10 +357,12 @@ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
result = wait_for_completion_killable(&dreq->completion);
+ if (!result) {
+ result = dreq->count;
+ WARN_ON_ONCE(dreq->count < 0);
+ }
if (!result)
result = dreq->error;
- if (!result)
- result = dreq->count;
out:
return (ssize_t) result;
@@ -363,28 +372,18 @@ out:
* Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
* the iocb is still valid here if this is a synchronous request.
*/
-static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
struct inode *inode = dreq->inode;
- if (dreq->iocb && write) {
- loff_t pos = dreq->iocb->ki_pos + dreq->count;
-
- spin_lock(&inode->i_lock);
- if (i_size_read(inode) < pos)
- i_size_write(inode, pos);
- spin_unlock(&inode->i_lock);
- }
-
- if (write)
- nfs_zap_mapping(inode, inode->i_mapping);
-
inode_dio_end(inode);
if (dreq->iocb) {
long res = (long) dreq->error;
- if (!res)
+ if (dreq->count != 0) {
res = (long) dreq->count;
+ WARN_ON_ONCE(dreq->count < 0);
+ }
dreq->iocb->ki_complete(dreq->iocb, res, 0);
}
@@ -396,7 +395,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
static void nfs_direct_readpage_release(struct nfs_page *req)
{
dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
- d_inode(req->wb_context->dentry)->i_sb->s_id,
+ req->wb_context->dentry->d_sb->s_id,
(unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
req->wb_bytes,
(long long)req_offset(req));
@@ -431,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
}
out_put:
if (put_dreq(dreq))
- nfs_direct_complete(dreq, false);
+ nfs_direct_complete(dreq);
hdr->release(hdr);
}
@@ -537,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
}
if (put_dreq(dreq))
- nfs_direct_complete(dreq, false);
+ nfs_direct_complete(dreq);
return 0;
}
@@ -545,7 +544,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers into which to read data
- * @pos: byte offset in file where reading starts
*
* We use this function for direct reads instead of calling
* generic_file_aio_read() in order to avoid gfar's check to see if
@@ -561,8 +559,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* client must read the updated atime from the server back into its
* cache.
*/
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -574,27 +571,22 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
- file, count, (long long) pos);
+ file, count, (long long) iocb->ki_pos);
result = 0;
if (!count)
goto out;
- inode_lock(inode);
- result = nfs_sync_mapping(mapping);
- if (result)
- goto out_unlock;
-
task_io_account_read(count);
result = -ENOMEM;
dreq = nfs_direct_req_alloc();
if (dreq == NULL)
- goto out_unlock;
+ goto out;
dreq->inode = inode;
- dreq->bytes_left = count;
- dreq->io_start = pos;
+ dreq->bytes_left = dreq->max_count = count;
+ dreq->io_start = iocb->ki_pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
@@ -605,24 +597,21 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ nfs_start_io_direct(inode);
+
NFS_I(inode)->read_io += count;
- result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
+ result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
if (result > 0)
- iocb->ki_pos = pos + result;
+ iocb->ki_pos += result;
}
- nfs_direct_req_release(dreq);
- return result;
-
out_release:
nfs_direct_req_release(dreq);
-out_unlock:
- inode_unlock(inode);
out:
return result;
}
@@ -632,13 +621,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
struct list_head *list,
struct nfs_commit_info *cinfo)
{
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
#ifdef CONFIG_NFS_V4_1
if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
#endif
nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
}
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
@@ -654,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ dreq->verf.committed = NFS_INVALID_STABLE_HOW;
+ nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
for (i = 0; i < dreq->mirror_count; i++)
dreq->mirrors[i].count = 0;
get_dreq(dreq);
@@ -673,13 +664,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
if (!nfs_pageio_add_request(&desc, req)) {
nfs_list_remove_request(req);
nfs_list_add_request(req, &failed);
- spin_lock(cinfo.lock);
+ spin_lock(&cinfo.inode->i_lock);
dreq->flags = 0;
if (desc.pg_error < 0)
dreq->error = desc.pg_error;
else
dreq->error = -EIO;
- spin_unlock(cinfo.lock);
+ spin_unlock(&cinfo.inode->i_lock);
}
nfs_release_request(req);
}
@@ -772,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq);
break;
default:
- nfs_direct_complete(dreq, true);
+ nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
+ nfs_direct_complete(dreq);
}
}
@@ -969,7 +961,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers from which to write data
- * @pos: byte offset in file where writing starts
*
* We use this function for direct writes instead of calling
* generic_file_aio_write() in order to avoid taking the inode
@@ -989,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t result = -EINVAL;
+ size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
@@ -999,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
- nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES,
- iov_iter_count(iter));
+ result = generic_write_checks(iocb, iter);
+ if (result <= 0)
+ return result;
+ count = result;
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
pos = iocb->ki_pos;
- end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
+ end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
- inode_lock(inode);
-
- result = nfs_sync_mapping(mapping);
- if (result)
- goto out_unlock;
-
- if (mapping->nrpages) {
- result = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
- if (result)
- goto out_unlock;
- }
-
- task_io_account_write(iov_iter_count(iter));
+ task_io_account_write(count);
result = -ENOMEM;
dreq = nfs_direct_req_alloc();
if (!dreq)
- goto out_unlock;
+ goto out;
dreq->inode = inode;
- dreq->bytes_left = iov_iter_count(iter);
+ dreq->bytes_left = dreq->max_count = count;
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -1038,35 +1020,28 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ nfs_start_io_direct(inode);
+
result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
+ pos >> PAGE_SHIFT, end);
}
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
if (result > 0) {
- struct inode *inode = mapping->host;
-
iocb->ki_pos = pos + result;
- spin_lock(&inode->i_lock);
- if (i_size_read(inode) < iocb->ki_pos)
- i_size_write(inode, iocb->ki_pos);
- spin_unlock(&inode->i_lock);
- generic_write_sync(file, pos, result);
+ /* XXX: should check the generic_write_sync retval */
+ generic_write_sync(iocb, result);
}
}
- nfs_direct_req_release(dreq);
- return result;
-
out_release:
nfs_direct_req_release(dreq);
-out_unlock:
- inode_unlock(inode);
+out:
return result;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 89bf093d342a5..7d620970f2e1a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -164,18 +164,20 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
ssize_t result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_read(iocb, to, iocb->ki_pos);
+ return nfs_file_direct_read(iocb, to);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
iov_iter_count(to), (unsigned long) iocb->ki_pos);
- result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
+ nfs_start_io_read(inode);
+ result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
result = generic_file_read_iter(iocb, to);
if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
}
+ nfs_end_io_read(inode);
return result;
}
EXPORT_SYMBOL_GPL(nfs_file_read);
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
filp, (unsigned long) count, (unsigned long long) *ppos);
- res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
+ nfs_start_io_read(inode);
+ res = nfs_revalidate_mapping(inode, filp->f_mapping);
if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags);
if (res > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
}
+ nfs_end_io_read(inode);
return res;
}
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_enter(inode);
- inode_dio_wait(inode);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
break;
- inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync);
if (!ret)
ret = pnfs_sync_inode(inode, !!datasync);
- inode_unlock(inode);
/*
* If nfs_file_fsync_commit detected a server reboot, then
* resend all dirty pages that might have been covered by
@@ -320,7 +321,7 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
loff_t pos, unsigned len)
{
unsigned int pglen = nfs_page_length(page);
- unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int offset = pos & (PAGE_SIZE - 1);
unsigned int end = offset + len;
if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
@@ -351,7 +352,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
int ret;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int once_thru = 0;
@@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file, mapping->host->i_ino, len, (long long) pos);
start:
- /*
- * Prevent starvation issues if someone is doing a consistency
- * sync-to-disk
- */
- ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
- nfs_wait_bit_killable, TASK_KILLABLE);
- if (ret)
- return ret;
- /*
- * Wait for O_DIRECT to complete
- */
- inode_dio_wait(mapping->host);
-
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -380,12 +368,12 @@ start:
ret = nfs_flush_incompatible(file, page);
if (ret) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
} else if (!once_thru &&
nfs_want_read_modify_write(file, page, pos, len)) {
once_thru = 1;
ret = nfs_readpage(file, page);
- page_cache_release(page);
+ put_page(page);
if (!ret)
goto start;
}
@@ -396,7 +384,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = pos & (PAGE_SIZE - 1);
struct nfs_open_context *ctx = nfs_file_open_context(file);
int status;
@@ -413,26 +401,26 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
if (pglen == 0) {
zero_user_segments(page, 0, offset,
- end, PAGE_CACHE_SIZE);
+ end, PAGE_SIZE);
SetPageUptodate(page);
} else if (end >= pglen) {
- zero_user_segment(page, end, PAGE_CACHE_SIZE);
+ zero_user_segment(page, end, PAGE_SIZE);
if (offset == 0)
SetPageUptodate(page);
} else
- zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ zero_user_segment(page, pglen, PAGE_SIZE);
}
status = nfs_updatepage(file, page, offset, copied);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (status < 0)
return status;
NFS_I(mapping->host)->write_io += copied;
- if (nfs_ctx_key_to_expire(ctx)) {
+ if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
status = nfs_wb_all(mapping->host);
if (status < 0)
return status;
@@ -454,7 +442,7 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
page, offset, length);
- if (offset != 0 || length < PAGE_CACHE_SIZE)
+ if (offset != 0 || length < PAGE_SIZE)
return;
/* Cancel any unstarted writes on this page */
nfs_wb_page_cancel(page_file_mapping(page)->host, page);
@@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
*/
static int nfs_release_page(struct page *page, gfp_t gfp)
{
- struct address_space *mapping = page->mapping;
-
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- /* Always try to initiate a 'commit' if relevant, but only
- * wait for it if the caller allows blocking. Even then,
- * only wait 1 second and only if the 'bdi' is not congested.
- * Waiting indefinitely can cause deadlocks when the NFS
- * server is on this machine, when a new TCP connection is
- * needed and in other rare cases. There is no particular
- * need to wait extensively here. A short wait has the
- * benefit that someone else can worry about the freezer.
- */
- if (mapping) {
- struct nfs_server *nfss = NFS_SERVER(mapping->host);
- nfs_commit_inode(mapping->host, 0);
- if (gfpflags_allow_blocking(gfp) &&
- !bdi_write_congested(&nfss->backing_dev_info)) {
- wait_on_page_bit_killable_timeout(page, PG_private,
- HZ);
- if (PagePrivate(page))
- set_bdi_congested(&nfss->backing_dev_info,
- BLK_RW_ASYNC);
- }
- }
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
return 0;
@@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
filp, filp->f_mapping->host->i_ino,
(long long)page_offset(page));
+ sb_start_pagefault(inode->i_sb);
+
/* make sure the cache has finished storing the page */
nfs_fscache_wait_on_page_write(NFS_I(inode), page);
@@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out_unlock:
unlock_page(page);
out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
@@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode)
ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
- nfs_ctx_key_to_expire(ctx))
+ nfs_ctx_key_to_expire(ctx, inode))
return 1;
return 0;
}
@@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
- size_t count = iov_iter_count(from);
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
- if (iocb->ki_flags & IOCB_DIRECT) {
- result = generic_write_checks(iocb, from);
- if (result <= 0)
- return result;
+ if (iocb->ki_flags & IOCB_DIRECT)
return nfs_file_direct_write(iocb, from);
- }
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
- file, count, (long long) iocb->ki_pos);
+ file, iov_iter_count(from), (long long) iocb->ki_pos);
- result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
/*
@@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- result = count;
- if (!count)
+ nfs_start_io_write(inode);
+ result = generic_write_checks(iocb, from);
+ if (result > 0) {
+ current->backing_dev_info = inode_to_bdi(inode);
+ result = generic_perform_write(file, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+ }
+ nfs_end_io_write(inode);
+ if (result <= 0)
goto out;
- result = generic_file_write_iter(iocb, from);
- if (result > 0)
- written = result;
+ written = generic_write_sync(iocb, result);
+ iocb->ki_pos += written;
/* Return error values */
- if (result >= 0 && nfs_need_check_write(file, inode)) {
+ if (nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0);
if (err < 0)
result = err;
}
- if (result > 0)
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
out:
return result;
out_swapfile:
printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
- goto out;
+ return -EBUSY;
}
EXPORT_SYMBOL_GPL(nfs_file_write);
@@ -780,11 +747,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
}
static int
-is_time_granular(struct timespec *ts) {
- return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
-}
-
-static int
do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
struct inode *inode = filp->f_mapping->host;
@@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
- if (is_time_granular(&NFS_SERVER(inode)->time_delta))
- __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- else
- nfs_zap_caches(inode);
- }
+ if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ nfs_zap_mapping(inode, filp->f_mapping);
out:
return status;
}
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3384dc8e66836..a3fc48ba4931d 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task,
static void
filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
+ loff_t end_offs = 0;
if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
- hdr->res.verf->committed != NFS_DATA_SYNC)
+ hdr->res.verf->committed == NFS_FILE_SYNC)
return;
+ if (hdr->res.verf->committed == NFS_DATA_SYNC)
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
- pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
- hdr->mds_offset + hdr->res.count);
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task,
}
filelayout_set_layoutcommit(hdr);
+
+ /* zero out the fattr */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
return 0;
}
@@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
- if (data->verf.committed == NFS_UNSTABLE)
- pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+ pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
@@ -795,7 +803,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
}
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
if (cinfo->ds->nbuckets >= size)
goto out;
for (i = 0; i < cinfo->ds->nbuckets; i++) {
@@ -811,7 +819,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
swap(cinfo->ds->buckets, buckets);
cinfo->ds->nbuckets = size;
out:
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
kfree(buckets);
return 0;
}
@@ -890,6 +898,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
+ false,
GFP_KERNEL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -915,6 +924,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0cb1abd535e38..e6206eaf2bdf3 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -26,6 +26,8 @@
#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+static struct group_info *ff_zero_group;
+
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
@@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
kfree(FF_LAYOUT_FROM_HDR(lo));
}
-static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
__be32 *p;
p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return -ENOBUFS;
- memcpy(stateid, p, NFS4_STATEID_SIZE);
+ stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+ memcpy(stateid->data, p, NFS4_STATEID_SIZE);
dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
p[0], p[1], p[2], p[3]);
return 0;
@@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
+ struct rpc_cred *cred;
+
ff_layout_remove_mirror(mirror);
kfree(mirror->fh_versions);
- if (mirror->cred)
- put_rpccred(mirror->cred);
+ cred = rcu_access_pointer(mirror->ro_cred);
+ if (cred)
+ put_rpccred(cred);
+ cred = rcu_access_pointer(mirror->rw_cred);
+ if (cred)
+ put_rpccred(cred);
nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
kfree(mirror);
}
@@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
{
u64 new_end, old_end;
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+ return false;
if (new->pls_range.iomode != old->pls_range.iomode)
return false;
old_end = pnfs_calc_offset_end(old->pls_range.offset,
@@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
new_end);
if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
set_bit(NFS_LSEG_ROC, &new->pls_flags);
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
- set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
return true;
}
@@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
struct nfs4_deviceid_node *idnode;
- u32 ds_count;
- u32 fh_count;
+ struct auth_cred acred = { .group_info = ff_zero_group };
+ struct rpc_cred __rcu *cred;
+ u32 ds_count, fh_count, id;
int j;
rc = -EIO;
@@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->efficiency = be32_to_cpup(p);
/* stateid */
- rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+ rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
if (rc)
goto out_err_free;
@@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->fh_versions_cnt = fh_count;
/* user */
- rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+ rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
+ acred.uid = make_kuid(&init_user_ns, id);
+
/* group */
- rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+ rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
+ acred.gid = make_kgid(&init_user_ns, id);
+
+ /* find the cred for it */
+ rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags));
+ if (IS_ERR(cred)) {
+ rc = PTR_ERR(cred);
+ goto out_err_free;
+ }
+
+ if (lgr->range.iomode == IOMODE_READ)
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ else
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+
mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
if (mirror != fls->mirror_array[i]) {
+ /* swap cred ptrs so free_mirror will clean up old */
+ if (lgr->range.iomode == IOMODE_READ) {
+ cred = xchg(&mirror->ro_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ } else {
+ cred = xchg(&mirror->rw_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+ }
ff_layout_free_mirror(fls->mirror_array[i]);
fls->mirror_array[i] = mirror;
}
- dprintk("%s: uid %d gid %d\n", __func__,
- fls->mirror_array[i]->uid,
- fls->mirror_array[i]->gid);
+ dprintk("%s: iomode %s uid %u gid %u\n", __func__,
+ lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
+ from_kuid(&init_user_ns, acred.uid),
+ from_kgid(&init_user_ns, acred.gid));
}
p = xdr_inline_decode(&stream, 4);
@@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
else {
int i;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
if (cinfo->ds->nbuckets != 0)
kfree(buckets);
else {
@@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
NFS_INVALID_STABLE_HOW;
}
}
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
return 0;
}
}
@@ -786,6 +821,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
}
static void
+ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req,
+ bool strict_iomode)
+{
+retry_strict:
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_READ,
+ strict_iomode,
+ GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ }
+
+ /* If we don't have checking, do get a IOMODE_RW
+ * segment, and the server wants to avoid READs
+ * there, then retry!
+ */
+ if (pgio->pg_lseg && !strict_iomode &&
+ ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
+ strict_iomode = true;
+ goto retry_strict;
+ }
+}
+
+static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
/* Use full layout for now */
- if (!pgio->pg_lseg) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- req->wb_context,
- 0,
- NFS4_MAX_UINT64,
- IOMODE_READ,
- GFP_KERNEL);
- if (IS_ERR(pgio->pg_lseg)) {
- pgio->pg_error = PTR_ERR(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
- return;
- }
- }
+ if (!pgio->pg_lseg)
+ ff_layout_pg_get_read(pgio, req, false);
+ else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg))
+ ff_layout_pg_get_read(pgio, req, true);
+
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
- if (!ds)
- goto out_mds;
+ if (!ds) {
+ if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_pnfs;
+ else
+ goto out_mds;
+ }
+
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
pgio->pg_mirror_idx = ds_idx;
@@ -828,6 +890,12 @@ out_mds:
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
nfs_pageio_reset_read_mds(pgio);
+ return;
+
+out_pnfs:
+ pnfs_set_lo_fail(pgio->pg_lseg);
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
}
static void
@@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
for (i = 0; i < pgio->pg_mirror_count; i++) {
ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
- if (!ds)
- goto out_mds;
+ if (!ds) {
+ if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_pnfs;
+ else
+ goto out_mds;
+ }
pgm = &pgio->pg_mirrors[i];
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
@@ -883,6 +956,12 @@ out_mds:
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
nfs_pageio_reset_write_mds(pgio);
+ return;
+
+out_pnfs:
+ pnfs_set_lo_fail(pgio->pg_lseg);
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
}
static unsigned int
@@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
- if (ff_layout_no_fallback_to_mds(lseg) ||
- ff_layout_has_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg))
return -NFS4ERR_RESET_TO_PNFS;
reset:
dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
goto out_eagain;
- set_bit(NFS_LAYOUT_RETURN_REQUESTED,
- &hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1248,19 +1325,20 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
* we always send layoutcommit after DS writes.
*/
static void
-ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+ff_layout_set_layoutcommit(struct inode *inode,
+ struct pnfs_layout_segment *lseg,
+ loff_t end_offset)
{
- if (!ff_layout_need_layoutcommit(hdr->lseg))
+ if (!ff_layout_need_layoutcommit(lseg))
return;
- pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
- hdr->mds_offset + hdr->res.count);
- dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
- (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+ pnfs_set_layoutcommit(inode, lseg, end_offset);
+ dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
+ (unsigned long long) NFS_I(inode)->layout->plh_lwb);
}
static bool
-ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
{
/* No mirroring for now */
struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1297,16 +1375,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
rpc_exit(task, -EIO);
return -EIO;
}
- if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
- dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
- if (ff_layout_has_available_ds(hdr->lseg))
- pnfs_read_resend_pnfs(hdr);
- else
- ff_layout_reset_read(hdr);
- rpc_exit(task, 0);
+ if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+ rpc_exit(task, -EHOSTDOWN);
return -EAGAIN;
}
- hdr->pgio_done_cb = ff_layout_read_done_cb;
ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
@@ -1398,6 +1470,7 @@ static void ff_layout_read_release(void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ loff_t end_offs = 0;
int err;
trace_nfs4_pnfs_write(hdr, task->tk_status);
@@ -1423,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
if (hdr->res.verf->committed == NFS_FILE_SYNC ||
hdr->res.verf->committed == NFS_DATA_SYNC)
- ff_layout_set_layoutcommit(hdr);
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
/* zero out fattr since we don't care DS attr at all */
hdr->fattr.valid = 0;
@@ -1459,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
- if (data->verf.committed == NFS_UNSTABLE
- && ff_layout_need_layoutcommit(data->lseg))
- pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+ ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
@@ -1496,14 +1570,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EIO;
}
- if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
- bool retry_pnfs;
-
- retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
- dprintk("%s task %u reset io to %s\n", __func__,
- task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
- ff_layout_reset_write(hdr, retry_pnfs);
- rpc_exit(task, 0);
+ if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+ rpc_exit(task, -EHOSTDOWN);
return -EAGAIN;
}
@@ -1712,7 +1780,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
goto out_failed;
ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
- if (IS_ERR(ds_cred))
+ if (!ds_cred)
goto out_failed;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1720,6 +1788,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
+ hdr->pgio_done_cb = ff_layout_read_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
@@ -1737,11 +1806,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
vers == 3 ? &ff_layout_read_call_ops_v3 :
&ff_layout_read_call_ops_v4,
0, RPC_TASK_SOFTCONN);
-
+ put_rpccred(ds_cred);
return PNFS_ATTEMPTED;
out_failed:
- if (ff_layout_has_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg))
return PNFS_TRY_AGAIN;
return PNFS_NOT_ATTEMPTED;
}
@@ -1769,7 +1838,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
return PNFS_NOT_ATTEMPTED;
ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
- if (IS_ERR(ds_cred))
+ if (!ds_cred)
return PNFS_NOT_ATTEMPTED;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1798,6 +1867,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
vers == 3 ? &ff_layout_write_call_ops_v3 :
&ff_layout_write_call_ops_v4,
sync, RPC_TASK_SOFTCONN);
+ put_rpccred(ds_cred);
return PNFS_ATTEMPTED;
}
@@ -1824,7 +1894,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct rpc_clnt *ds_clnt;
struct rpc_cred *ds_cred;
u32 idx;
- int vers;
+ int vers, ret;
struct nfs_fh *fh;
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
@@ -1838,7 +1908,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
goto out_err;
ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
- if (IS_ERR(ds_cred))
+ if (!ds_cred)
goto out_err;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1854,10 +1924,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
if (fh)
data->args.fh = fh;
- return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+ ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
vers == 3 ? &ff_layout_commit_call_ops_v3 :
&ff_layout_commit_call_ops_v4,
how, RPC_TASK_SOFTCONN);
+ put_rpccred(ds_cred);
+ return ret;
out_err:
pnfs_generic_prepare_to_resend_writes(data);
pnfs_generic_commit_release(data);
@@ -2223,6 +2295,11 @@ static int __init nfs4flexfilelayout_init(void)
{
printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
__func__);
+ if (!ff_zero_group) {
+ ff_zero_group = groups_alloc(0);
+ if (!ff_zero_group)
+ return -ENOMEM;
+ }
return pnfs_register_layoutdriver(&flexfilelayout_type);
}
@@ -2231,6 +2308,10 @@ static void __exit nfs4flexfilelayout_exit(void)
printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
__func__);
pnfs_unregister_layoutdriver(&flexfilelayout_type);
+ if (ff_zero_group) {
+ put_group_info(ff_zero_group);
+ ff_zero_group = NULL;
+ }
}
MODULE_ALIAS("nfs-layouttype4-4");
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index dd353bb7dc0a0..1bcdb15d0c41a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,7 +10,8 @@
#define FS_NFS_NFS4FLEXFILELAYOUT_H
#define FF_FLAGS_NO_LAYOUTCOMMIT 1
-#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
#include "../pnfs.h"
@@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror {
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
nfs4_stateid stateid;
- u32 uid;
- u32 gid;
- struct rpc_cred *cred;
+ struct rpc_cred __rcu *ro_cred;
+ struct rpc_cred __rcu *rw_cred;
atomic_t ref;
spinlock_t lock;
struct nfs4_ff_layoutstat read_stat;
@@ -154,6 +154,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
}
static inline bool
+ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
+}
+
+static inline bool
ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
{
return nfs4_test_deviceid_unavailable(node);
@@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
u32 ds_idx, struct rpc_cred *mdscred);
bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
+
#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index add0e5a70bd60..0aa36be71fcea 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
return e1->opnum < e2->opnum ? -1 : 1;
if (e1->status != e2->status)
return e1->status < e2->status ? -1 : 1;
- ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+ ret = memcmp(e1->stateid.data, e2->stateid.data,
+ sizeof(e1->stateid.data));
if (ret != 0)
return ret;
ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
@@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
return 0;
}
-/* currently we only support AUTH_NONE and AUTH_SYS */
-static rpc_authflavor_t
-nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
+static struct rpc_cred *
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
{
- if (mirror->uid == (u32)-1)
- return RPC_AUTH_NULL;
- return RPC_AUTH_UNIX;
-}
+ struct rpc_cred *cred, __rcu **pcred;
-/* fetch cred for NFSv3 DS */
-static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
- struct nfs4_pnfs_ds *ds)
-{
- if (ds->ds_clp && !mirror->cred &&
- mirror->mirror_ds->ds_versions[0].version == 3) {
- struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
- struct rpc_cred *cred;
- struct auth_cred acred = {
- .uid = make_kuid(&init_user_ns, mirror->uid),
- .gid = make_kgid(&init_user_ns, mirror->gid),
- };
-
- /* AUTH_NULL ignores acred */
- cred = auth->au_ops->lookup_cred(auth, &acred, 0);
- if (IS_ERR(cred)) {
- dprintk("%s: lookup_cred failed with %ld\n",
- __func__, PTR_ERR(cred));
- return PTR_ERR(cred);
- } else {
- if (cmpxchg(&mirror->cred, NULL, cred))
- put_rpccred(cred);
- }
- }
- return 0;
+ if (iomode == IOMODE_READ)
+ pcred = &mirror->ro_cred;
+ else
+ pcred = &mirror->rw_cred;
+
+ rcu_read_lock();
+ do {
+ cred = rcu_dereference(*pcred);
+ if (!cred)
+ break;
+
+ cred = get_rpccred_rcu(cred);
+ } while(!cred);
+ rcu_read_unlock();
+ return cred;
}
struct nfs_fh *
@@ -356,7 +343,23 @@ out:
return fh;
}
-/* Upon return, either ds is connected, or ds is NULL */
+/**
+ * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
+ * @lseg: the layout segment we're operating on
+ * @ds_idx: index of the DS to use
+ * @fail_return: return layout on connect failure?
+ *
+ * Try to prepare a DS connection to accept an RPC call. This involves
+ * selecting a mirror to use and connecting the client to it if it's not
+ * already connected.
+ *
+ * Since we only need a single functioning mirror to satisfy a read, we don't
+ * want to return the layout if there is one. For writes though, any down
+ * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
+ * between the two cases.
+ *
+ * Returns a pointer to a connected DS object on success or NULL on failure.
+ */
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
bool fail_return)
@@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
struct inode *ino = lseg->pls_layout->plh_inode;
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
- rpc_authflavor_t flavor;
if (!ff_layout_mirror_valid(lseg, mirror)) {
pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
@@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
smp_rmb();
if (ds->ds_clp)
- goto out_update_creds;
-
- flavor = nfs4_ff_layout_choose_authflavor(mirror);
+ goto out;
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
@@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
dataserver_retrans,
mirror->mirror_ds->ds_versions[0].version,
mirror->mirror_ds->ds_versions[0].minor_version,
- flavor);
+ RPC_AUTH_UNIX);
/* connect success, check rsize/wsize limit */
if (ds->ds_clp) {
@@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
- if (!fail_return) {
- if (ff_layout_has_available_ds(lseg))
- set_bit(NFS_LAYOUT_RETURN_REQUESTED,
- &lseg->pls_layout->plh_flags);
- else
- pnfs_error_mark_layout_for_return(ino, lseg);
- } else
+ if (fail_return || !ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(ino, lseg);
ds = NULL;
- goto out;
}
-out_update_creds:
- if (ff_layout_update_mirror_cred(mirror, ds))
- ds = NULL;
out:
return ds;
}
@@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
struct rpc_cred *mdscred)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
- struct rpc_cred *cred = ERR_PTR(-EINVAL);
-
- if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
- goto out;
+ struct rpc_cred *cred;
- if (mirror && mirror->cred)
- cred = mirror->cred;
- else
- cred = mdscred;
-out:
+ if (mirror) {
+ cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+ if (!cred)
+ cred = get_rpccred(mdscred);
+ } else {
+ cred = get_rpccred(mdscred);
+ }
return cred;
}
@@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
return ff_rw_layout_has_available_ds(lseg);
}
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
+{
+ return ff_layout_no_fallback_to_mds(lseg) ||
+ ff_layout_has_available_ds(lseg);
+}
+
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+ return lseg->pls_range.iomode == IOMODE_RW &&
+ ff_layout_no_read_on_rw(lseg);
+}
+
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
"retries a request before it attempts further "
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 33d18c4119057..bf4ec5ecc97e4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -282,6 +282,7 @@ nfs_init_locked(struct inode *inode, void *opaque)
struct nfs_fattr *fattr = desc->fattr;
set_nfs_fileid(inode, fattr->fileid);
+ inode->i_mode = fattr->mode;
nfs_copy_fh(NFS_FH(inode), desc->fh);
return 0;
}
@@ -661,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
- inode_lock(inode);
- err = nfs_sync_inode(inode);
- inode_unlock(inode);
+ err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto out;
}
@@ -878,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
- list_add(&ctx->list, &nfsi->open_files);
+ if (ctx->mode & FMODE_WRITE)
+ list_add(&ctx->list, &nfsi->open_files);
+ else
+ list_add_tail(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -940,7 +942,7 @@ int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
@@ -971,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (NFS_STALE(inode))
goto out;
+ /* pNFS: Attributes aren't updated until we layoutcommit */
+ if (S_ISREG(inode->i_mode)) {
+ status = pnfs_sync_inode(inode, false);
+ if (status)
+ goto out;
+ }
+
status = -ENOMEM;
fattr = nfs_alloc_fattr();
if (fattr == NULL)
@@ -1121,14 +1130,12 @@ out:
}
/**
- * __nfs_revalidate_mapping - Revalidate the pagecache
+ * nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
- * @may_lock - take inode->i_mutex?
*/
-static int __nfs_revalidate_mapping(struct inode *inode,
- struct address_space *mapping,
- bool may_lock)
+int nfs_revalidate_mapping(struct inode *inode,
+ struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
@@ -1177,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode,
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
- if (may_lock) {
- inode_lock(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
- inode_unlock(inode);
- } else
- ret = nfs_invalidate_mapping(inode, mapping);
+ ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1192,27 +1194,28 @@ out:
return ret;
}
-/**
- * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- */
-int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_writers(struct nfs_inode *nfsi)
{
- return __nfs_revalidate_mapping(inode, mapping, false);
+ struct inode *inode = &nfsi->vfs_inode;
+
+ assert_spin_locked(&inode->i_lock);
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (list_empty(&nfsi->open_files))
+ return false;
+ /* Note: This relies on nfsi->open_files being ordered with writers
+ * being placed at the head of the list.
+ * See nfs_inode_attach_open_context()
+ */
+ return (list_first_entry(&nfsi->open_files,
+ struct nfs_open_context,
+ list)->mode & FMODE_WRITE) == FMODE_WRITE;
}
-/**
- * nfs_revalidate_mapping_protected - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- *
- * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
- * while invalidating the mapping.
- */
-int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
{
- return __nfs_revalidate_mapping(inode, mapping, true);
+ return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
}
static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@@ -1279,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
return -EIO;
- if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
- inode->i_version != fattr->change_attr)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if (!nfs_file_has_buffered_writers(nfsi)) {
+ /* Verify a few of the more important attributes */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
+ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
- /* Verify a few of the more important attributes */
- if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
- invalid |= NFS_INO_INVALID_ATTR;
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
+ invalid |= NFS_INO_INVALID_ATTR;
- if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
- cur_size = i_size_read(inode);
- new_isize = nfs_size_to_loff_t(fattr->size);
- if (cur_size != new_isize)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
+ invalid |= NFS_INO_INVALID_ATTR;
+
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ cur_size = i_size_read(inode);
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ if (cur_size != new_isize)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ }
}
- if (nfsi->nrequests != 0)
- invalid &= ~NFS_INO_REVAL_PAGECACHE;
/* Have any file permissions changed? */
if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@@ -1469,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
-/*
- * Don't trust the change_attribute, mtime, ctime or size if
- * a pnfs LAYOUTCOMMIT is outstanding
- */
-static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
- struct nfs_fattr *fattr)
-{
- if (pnfs_layoutcommit_outstanding(inode))
- fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
- NFS_ATTR_FATTR_MTIME |
- NFS_ATTR_FATTR_CTIME |
- NFS_ATTR_FATTR_SIZE);
-}
-
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
int ret;
trace_nfs_refresh_inode_enter(inode);
- nfs_inode_attrs_handle_layoutcommit(inode, fattr);
-
if (nfs_inode_attrs_need_update(inode, fattr))
ret = nfs_update_inode(inode, fattr);
else
@@ -1526,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
- unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ unsigned long invalid = NFS_INO_INVALID_ATTR;
/*
* Don't revalidate the pagecache if we hold a delegation, but do
@@ -1675,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool have_writers = nfs_file_has_buffered_writers(nfsi);
bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
@@ -1724,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do atomic weak cache consistency updates */
invalid |= nfs_wcc_update_inode(inode, fattr);
+ if (pnfs_layoutcommit_outstanding(inode)) {
+ nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
+ cache_revalidated = false;
+ }
+
/* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
if (inode->i_version != fattr->change_attr) {
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
- invalid |= NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_DATA
- | NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL;
- if (S_ISDIR(inode->i_mode))
- nfs_force_lookup_revalidate(inode);
+ /* Could it be a race with writeback? */
+ if (!have_writers) {
+ invalid |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ }
inode->i_version = fattr->change_attr;
}
} else {
@@ -1767,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
- if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
+ if (nfsi->nrequests == 0 || new_isize > cur_isize) {
i_size_write(inode, new_isize);
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ if (!have_writers)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n",
@@ -1958,9 +1957,7 @@ static void init_once(void *foo)
nfsi->nrequests = 0;
nfsi->commit_info.ncommit = 0;
atomic_set(&nfsi->commit_info.rpcs_out, 0);
- atomic_set(&nfsi->silly_count, 1);
- INIT_HLIST_HEAD(&nfsi->silly_list);
- init_waitqueue_head(&nfsi->waitqueue);
+ init_rwsem(&nfsi->rmdir_sem);
nfs4_init_once(nfsi);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 565f8135ae1fd..7ce5e023c3c3c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -66,13 +66,16 @@ struct nfs_clone_mount {
struct nfs_client_initdata {
unsigned long init_flags;
- const char *hostname;
- const struct sockaddr *addr;
+ const char *hostname; /* Hostname of the server */
+ const struct sockaddr *addr; /* Address of the server */
+ const char *nodename; /* Hostname of the client */
+ const char *ip_addr; /* IP address of the client */
size_t addrlen;
struct nfs_subversion *nfs_mod;
int proto;
u32 minorversion;
struct net *net;
+ const struct rpc_timeout *timeparms;
};
/*
@@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info);
extern const struct rpc_program nfs_program;
extern void nfs_clients_init(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
-int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t);
+int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
- const struct rpc_timeout *, const char *,
rpc_authflavor_t);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
@@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
rpc_authflavor_t);
extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
-extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
@@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
rpc_authflavor_t au_flavor);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
-extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo,
unsigned int ds_retrans, rpc_authflavor_t au_flavor);
@@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr);
+ const struct nfs_client_initdata *);
/* dir.c */
extern void nfs_force_use_readdirplus(struct inode *dir);
@@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
+/* io.c */
+extern void nfs_start_io_read(struct inode *inode);
+extern void nfs_end_io_read(struct inode *inode);
+extern void nfs_start_io_write(struct inode *inode);
+extern void nfs_end_io_write(struct inode *inode);
+extern void nfs_start_io_direct(struct inode *inode);
+extern void nfs_end_io_direct(struct inode *inode);
+
+static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
+{
+ return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
+}
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
@@ -477,6 +491,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
@@ -495,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
struct inode *inode,
struct nfs_direct_req *dreq);
int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend);
+
+#ifdef CONFIG_NFS_V4_1
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+ int i;
+
+ for (i = 0; i < cinfo->nbuckets; i++)
+ cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
+#else
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+}
+#endif
+
+
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
@@ -505,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *,
#define nfs_migrate_page NULL
#endif
+static inline int
+nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
+ const struct nfs_write_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
@@ -520,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr);
+ const struct nfs_client_initdata *);
extern int nfs40_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);
@@ -622,7 +663,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
if (!cinfo->dreq) {
struct inode *inode = page_file_mapping(page)->host;
- inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ inc_node_page_state(page, NR_UNSTABLE_NFS);
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
@@ -638,11 +679,11 @@ unsigned int nfs_page_length(struct page *page)
if (i_size > 0) {
pgoff_t page_index = page_file_index(page);
- pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
if (page_index < end_index)
- return PAGE_CACHE_SIZE;
+ return PAGE_SIZE;
if (page_index == end_index)
- return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+ return ((i_size - 1) & ~PAGE_MASK) + 1;
}
return 0;
}
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 0000000000000..1fc5d1ce327e2
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ *
+ * I/O and data path helper functionality.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(inode);
+ }
+}
+
+/**
+ * nfs_start_io_read - declare the file is being used for buffered reads
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+nfs_start_io_read(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_read - declare that the buffered read operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_read(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
+
+/**
+ * nfs_start_io_write - declare the file is being used for buffered writes
+ * @inode - file inode
+ *
+ * Declare that a buffered write operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+nfs_start_io_write(struct inode *inode)
+{
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(NFS_I(inode), inode);
+}
+
+/**
+ * nfs_end_io_write - declare that the buffered write operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_write(struct inode *inode)
+{
+ up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ set_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ nfs_wb_all(inode);
+ }
+}
+
+/**
+ * nfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+nfs_start_io_direct(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_buffered(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_direct(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 17c0fa1eccfaa..720d92f5abfb8 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -11,6 +11,38 @@
#define NFSDBG_FACILITY NFSDBG_PROC
+/*
+ * nfs3_prepare_get_acl, nfs3_complete_get_acl, nfs3_abort_get_acl: Helpers for
+ * caching get_acl results in a race-free way. See fs/posix_acl.c:get_acl()
+ * for explanations.
+ */
+static void nfs3_prepare_get_acl(struct posix_acl **p)
+{
+ struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+ if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) {
+ /* Not the first reader or sentinel already in place. */
+ }
+}
+
+static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl)
+{
+ struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+ /* Only cache the ACL if our sentinel is still in place. */
+ posix_acl_dup(acl);
+ if (cmpxchg(p, sentinel, acl) != sentinel)
+ posix_acl_release(acl);
+}
+
+static void nfs3_abort_get_acl(struct posix_acl **p)
+{
+ struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+ /* Remove our sentinel upon failure. */
+ cmpxchg(p, sentinel, ACL_NOT_CACHED);
+}
+
struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
{
struct nfs_server *server = NFS_SERVER(inode);
@@ -55,6 +87,11 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
if (res.fattr == NULL)
return ERR_PTR(-ENOMEM);
+ if (args.mask & NFS_ACL)
+ nfs3_prepare_get_acl(&inode->i_acl);
+ if (args.mask & NFS_DFACL)
+ nfs3_prepare_get_acl(&inode->i_default_acl);
+
status = rpc_call_sync(server->client_acl, &msg, 0);
dprintk("NFS reply getacl: %d\n", status);
@@ -89,12 +126,12 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
}
if (res.mask & NFS_ACL)
- set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access);
+ nfs3_complete_get_acl(&inode->i_acl, res.acl_access);
else
forget_cached_acl(inode, ACL_TYPE_ACCESS);
if (res.mask & NFS_DFACL)
- set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default);
+ nfs3_complete_get_acl(&inode->i_default_acl, res.acl_default);
else
forget_cached_acl(inode, ACL_TYPE_DEFAULT);
@@ -108,6 +145,8 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
}
getout:
+ nfs3_abort_get_acl(&inode->i_acl);
+ nfs3_abort_get_acl(&inode->i_default_acl);
posix_acl_release(res.acl_access);
posix_acl_release(res.acl_default);
nfs_free_fattr(res.fattr);
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 9e9fa347a9486..ee753547fb0a3 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
* low timeout interval so that if a connection is lost, we retry through
* the MDS.
*/
-struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
rpc_authflavor_t au_flavor)
{
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
.addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v3,
.proto = ds_proto,
.net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
};
- struct rpc_timeout ds_timeout;
struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
@@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
/* Use the MDS nfs_client cl_ipaddr. */
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
- au_flavor);
+ clp = nfs_get_client(&cl_init, au_flavor);
return clp;
}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cb28cceefebe0..698be93612808 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -144,7 +144,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
static int
-nfs3_proc_lookup(struct inode *dir, struct qstr *name,
+nfs3_proc_lookup(struct inode *dir, const struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr,
struct nfs4_label *label)
{
@@ -404,7 +404,7 @@ out:
}
static int
-nfs3_proc_remove(struct inode *dir, struct qstr *name)
+nfs3_proc_remove(struct inode *dir, const struct qstr *name)
{
struct nfs_removeargs arg = {
.fh = NFS_FH(dir),
@@ -480,7 +480,7 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
}
static int
-nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
{
struct nfs3_linkargs arg = {
.fromfh = NFS_FH(inode),
@@ -582,7 +582,7 @@ out:
}
static int
-nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
+nfs3_proc_rmdir(struct inode *dir, const struct qstr *name)
{
struct nfs_fattr *dir_attr;
struct nfs3_diropargs arg = {
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b587ccd310834..b6cd15314bab4 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -13,6 +13,7 @@
/* nfs4.2proc.c */
int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t);
int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index dff83460e5a63..33da841a21bb2 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -113,19 +113,135 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
return -EOPNOTSUPP;
- nfs_wb_all(inode);
inode_lock(inode);
+ err = nfs_sync_inode(inode);
+ if (err)
+ goto out_unlock;
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == 0)
truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-
+out_unlock:
inode_unlock(inode);
return err;
}
+static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
+ struct nfs_lock_context *src_lock,
+ struct file *dst, loff_t pos_dst,
+ struct nfs_lock_context *dst_lock,
+ size_t count)
+{
+ struct nfs42_copy_args args = {
+ .src_fh = NFS_FH(file_inode(src)),
+ .src_pos = pos_src,
+ .dst_fh = NFS_FH(file_inode(dst)),
+ .dst_pos = pos_dst,
+ .count = count,
+ };
+ struct nfs42_copy_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct inode *dst_inode = file_inode(dst);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ int status;
+
+ status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+ src_lock, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+ pos_src, pos_src + (loff_t)count - 1);
+ if (status)
+ return status;
+
+ status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
+ if (status)
+ return status;
+
+ status = nfs_sync_inode(dst_inode);
+ if (status)
+ return status;
+
+ status = nfs4_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == -ENOTSUPP)
+ server->caps &= ~NFS_CAP_COPY;
+ if (status)
+ return status;
+
+ if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
+ status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
+ if (status)
+ return status;
+ }
+
+ truncate_pagecache_range(dst_inode, pos_dst,
+ pos_dst + res.write_res.count);
+
+ return res.write_res.count;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+ struct file *dst, loff_t pos_dst,
+ size_t count)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(dst));
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs4_exception src_exception = { };
+ struct nfs4_exception dst_exception = { };
+ ssize_t err, err2;
+
+ if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
+ return -EOPNOTSUPP;
+
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.inode = file_inode(src);
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.inode = file_inode(dst);
+ dst_exception.state = dst_lock->open_context->state;
+
+ do {
+ inode_lock(file_inode(dst));
+ err = _nfs42_proc_copy(src, pos_src, src_lock,
+ dst, pos_dst, dst_lock, count);
+ inode_unlock(file_inode(dst));
+
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
+}
+
static loff_t _nfs42_proc_llseek(struct file *filep,
struct nfs_lock_context *lock, loff_t offset, int whence)
{
@@ -153,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
if (status)
return status;
- nfs_wb_all(inode);
+ status = nfs_filemap_write_and_wait_range(inode->i_mapping,
+ offset, LLONG_MAX);
+ if (status)
+ return status;
+
status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0);
if (status == -ENOTSUPP)
@@ -231,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
* Mark the bad layout state as invalid, then retry
* with the current stateid.
*/
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+ pnfs_mark_layout_stateid_invalid(lo, &head);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
} else
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0ca482a51e532..8b2605882a201 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -9,9 +9,22 @@
#define encode_fallocate_maxsz (encode_stateid_maxsz + \
2 /* offset */ + \
2 /* length */)
+#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 /* wr_count */ + \
+ 1 /* wr_committed */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
#define encode_allocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_allocate_maxsz (op_decode_hdr_maxsz)
+#define encode_copy_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 + 2 + 2 + 1 + 1 + 1)
+#define decode_copy_maxsz (op_decode_hdr_maxsz + \
+ NFS42_WRITE_RES_SIZE + \
+ 1 /* cr_consecutive */ + \
+ 1 /* cr_synchronous */)
#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
@@ -49,6 +62,16 @@
decode_putfh_maxsz + \
decode_allocate_maxsz + \
decode_getattr_maxsz)
+#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_copy_maxsz)
+#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_copy_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
@@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr,
encode_fallocate(xdr, args);
}
+static void encode_copy(struct xdr_stream *xdr,
+ struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+ encode_uint64(xdr, args->src_pos);
+ encode_uint64(xdr, args->dst_pos);
+ encode_uint64(xdr, args->count);
+
+ encode_uint32(xdr, 1); /* consecutive = true */
+ encode_uint32(xdr, 1); /* synchronous = true */
+ encode_uint32(xdr, 0); /* src server list */
+}
+
static void encode_deallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args,
struct compound_hdr *hdr)
@@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
}
/*
+ * Encode COPY request
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_copy_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_copy(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode DEALLOCATE request
*/
static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
@@ -266,6 +326,70 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
return decode_op_hdr(xdr, OP_ALLOCATE);
}
+static int decode_write_response(struct xdr_stream *xdr,
+ struct nfs42_write_res *res)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 + 8 + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ /*
+ * We never use asynchronous mode, so warn if a server returns
+ * a stateid.
+ */
+ if (unlikely(*p != 0)) {
+ pr_err_once("%s: server has set unrequested "
+ "asynchronous mode\n", __func__);
+ return -EREMOTEIO;
+ }
+ p++;
+ p = xdr_decode_hyper(p, &res->count);
+ res->verifier.committed = be32_to_cpup(p);
+ return decode_verifier(xdr, &res->verifier.verifier);
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_copy_requirements(struct xdr_stream *xdr,
+ struct nfs42_copy_res *res) {
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ res->consecutive = be32_to_cpup(p++);
+ res->synchronous = be32_to_cpup(p++);
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_COPY);
+ if (status == NFS4ERR_OFFLOAD_NO_REQS) {
+ status = decode_copy_requirements(xdr, res);
+ if (status)
+ return status;
+ return NFS4ERR_OFFLOAD_NO_REQS;
+ } else if (status)
+ return status;
+
+ status = decode_write_response(xdr, &res->write_res);
+ if (status)
+ return status;
+
+ return decode_copy_requirements(xdr, res);
+}
+
static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_DEALLOCATE);
@@ -331,6 +455,36 @@ out:
}
/*
+ * Decode COPY response
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_copy_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_copy(xdr, res);
+out:
+ return status;
+}
+
+/*
* Decode DEALLOCATE request
*/
static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4afdee420d253..324bfdc212504 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -185,6 +185,7 @@ struct nfs4_state {
struct nfs4_exception {
struct nfs4_state *state;
struct inode *inode;
+ nfs4_stateid *stateid;
long timeout;
unsigned char delay : 1,
recovering : 1,
@@ -224,7 +225,8 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
extern struct file_system_type nfs4_fs_type;
/* nfs4namespace.c */
-struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *);
+struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *,
+ const struct qstr *);
struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
struct nfs_fh *, struct nfs_fattr *);
int nfs4_replace_transport(struct nfs_server *server,
@@ -251,7 +253,7 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc
extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
struct page *page, struct rpc_cred *);
extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
-extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
+extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, const struct qstr *,
struct nfs_fh *, struct nfs_fattr *);
extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
extern const struct xattr_handler *nfs4_xattr_handlers[];
@@ -438,8 +440,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
struct nfs41_server_scope **);
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
- fmode_t, const struct nfs_lockowner *);
+extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
+ const struct nfs_lockowner *, nfs4_stateid *,
+ struct rpc_cred **);
extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -496,12 +499,15 @@ extern struct svc_version nfs4_callback_version4;
static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
{
- memcpy(dst, src, sizeof(*dst));
+ memcpy(dst->data, src->data, sizeof(dst->data));
+ dst->type = src->type;
}
static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
{
- return memcmp(dst, src, sizeof(*dst)) == 0;
+ if (dst->type != src->type)
+ return false;
+ return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
}
static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 10410e8b58530..8d7d08d4f95f1 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr)
+ const struct nfs_client_initdata *cl_init)
{
char buf[INET6_ADDRSTRLEN + 1];
+ const char *ip_addr = cl_init->ip_addr;
struct nfs_client *old;
int error;
@@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
if (error == -EINVAL)
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0)
goto error;
@@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server,
.hostname = hostname,
.addr = addr,
.addrlen = addrlen,
+ .ip_addr = ip_addr,
.nfs_mod = &nfs_v4,
.proto = proto,
.minorversion = minorversion,
.net = net,
+ .timeparms = timeparms,
};
struct nfs_client *clp;
int error;
@@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
+ clp = nfs_get_client(&cl_init, authflavour);
if (IS_ERR(clp)) {
error = PTR_ERR(clp);
goto error;
@@ -842,20 +844,24 @@ error:
* low timeout interval so that if a connection is lost, we retry through
* the MDS.
*/
-struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
u32 minor_version, rpc_authflavor_t au_flavor)
{
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
.addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v4,
.proto = ds_proto,
.minorversion = minor_version,
.net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
};
- struct rpc_timeout ds_timeout;
struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
@@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
/*
* Set an authflavor equual to the MDS value. Use the MDS nfs_client
* cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
* (section 13.1 RFC 5661).
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
- au_flavor);
+ clp = nfs_get_client(&cl_init, au_flavor);
dprintk("<-- %s %p\n", __func__, clp);
return clp;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 22c35abbee9d6..d085ad7948844 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -26,7 +26,7 @@ static int
nfs4_file_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file_dentry(filp);
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
@@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (openflags & O_TRUNC) {
attr.ia_valid |= ATTR_SIZE;
attr.ia_size = 0;
- nfs_sync_inode(inode);
+ filemap_write_and_wait(inode->i_mapping);
}
inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
@@ -129,6 +129,16 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
}
#ifdef CONFIG_NFS_V4_2
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t count, unsigned int flags)
+{
+ if (file_inode(file_in) == file_inode(file_out))
+ return -EINVAL;
+
+ return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
+}
+
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
{
loff_t ret;
@@ -243,6 +253,7 @@ const struct file_operations nfs4_file_operations = {
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
#ifdef CONFIG_NFS_V4_2
+ .copy_file_range = nfs4_copy_file_range,
.llseek = nfs4_file_llseek,
.fallocate = nfs42_fallocate,
.clone_file_range = nfs42_clone_file_range,
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 5ba22c6b0ffa6..c444285bb1b16 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -201,7 +201,7 @@ int nfs_idmap_init(void)
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index f592672373cbb..d21104912676c 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -208,7 +208,7 @@ static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt,
*/
struct rpc_clnt *
nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode,
- struct qstr *name)
+ const struct qstr *name)
{
struct page *page;
struct nfs4_secinfo_flavors *flavors;
@@ -397,7 +397,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
rpc_authflavor_t flavor = server->client->cl_auth->au_flavor;
struct dentry *parent = dget_parent(dentry);
struct inode *dir = d_inode(parent);
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
struct rpc_clnt *client;
struct vfsmount *mnt;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 327b8c34d3606..a036e93bdf965 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -74,6 +74,17 @@
#define NFS4_POLL_RETRY_MIN (HZ/10)
#define NFS4_POLL_RETRY_MAX (15*HZ)
+/* file attributes which can be mapped to nfs attributes */
+#define NFS4_VALID_ATTRS (ATTR_MODE \
+ | ATTR_UID \
+ | ATTR_GID \
+ | ATTR_SIZE \
+ | ATTR_ATIME \
+ | ATTR_MTIME \
+ | ATTR_CTIME \
+ | ATTR_ATIME_SET \
+ | ATTR_MTIME_SET)
+
struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -352,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
+ const nfs4_stateid *stateid = exception->stateid;
struct inode *inode = exception->inode;
int ret = errorcode;
@@ -365,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode && nfs_async_inode_return_delegation(inode,
- NULL) == 0)
- goto wait_on_recovery;
+ if (inode) {
+ int err;
+
+ err = nfs_async_inode_return_delegation(inode,
+ stateid);
+ if (err == 0)
+ goto wait_on_recovery;
+ if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
+ exception->retry = 1;
+ break;
+ }
+ }
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -416,6 +437,8 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
+ case -NFS4ERR_LAYOUTTRYLATER:
+ case -NFS4ERR_RECALLCONFLICT:
exception->delay = 1;
return 0;
@@ -2558,15 +2581,20 @@ static int _nfs4_do_open(struct inode *dir,
if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
nfs4_exclusive_attrset(opendata, sattr, &label);
-
- nfs_fattr_init(opendata->o_res.f_attr);
- status = nfs4_do_setattr(state->inode, cred,
- opendata->o_res.f_attr, sattr,
- state, label, olabel);
- if (status == 0) {
- nfs_setattr_update_inode(state->inode, sattr,
- opendata->o_res.f_attr);
- nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ /*
+ * send create attributes which was not set by open
+ * with an extra setattr.
+ */
+ if (sattr->ia_valid & NFS4_VALID_ATTRS) {
+ nfs_fattr_init(opendata->o_res.f_attr);
+ status = nfs4_do_setattr(state->inode, cred,
+ opendata->o_res.f_attr, sattr,
+ state, label, olabel);
+ if (status == 0) {
+ nfs_setattr_update_inode(state->inode, sattr,
+ opendata->o_res.f_attr);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ }
}
}
if (opened && opendata->file_created)
@@ -2652,46 +2680,32 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
return res;
}
-static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
- struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state, struct nfs4_label *ilabel,
- struct nfs4_label *olabel)
+static int _nfs4_do_setattr(struct inode *inode,
+ struct nfs_setattrargs *arg,
+ struct nfs_setattrres *res,
+ struct rpc_cred *cred,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_setattrargs arg = {
- .fh = NFS_FH(inode),
- .iap = sattr,
- .server = server,
- .bitmask = server->attr_bitmask,
- .label = ilabel,
- };
- struct nfs_setattrres res = {
- .fattr = fattr,
- .label = olabel,
- .server = server,
- };
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
+ .rpc_argp = arg,
+ .rpc_resp = res,
.rpc_cred = cred,
};
+ struct rpc_cred *delegation_cred = NULL;
unsigned long timestamp = jiffies;
fmode_t fmode;
bool truncate;
int status;
- arg.bitmask = nfs4_bitmask(server, ilabel);
- if (ilabel)
- arg.bitmask = nfs4_bitmask(server, olabel);
-
- nfs_fattr_init(fattr);
+ nfs_fattr_init(res->fattr);
/* Servers should only apply open mode checks for file size changes */
- truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
+ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
fmode = truncate ? FMODE_WRITE : FMODE_READ;
- if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
+ if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
} else if (truncate && state != NULL) {
struct nfs_lockowner lockowner = {
@@ -2700,16 +2714,20 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
};
if (!nfs4_valid_open_stateid(state))
return -EBADF;
- if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
- &lockowner) == -EIO)
+ if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
+ &arg->stateid, &delegation_cred) == -EIO)
return -EBADF;
} else
- nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+ nfs4_stateid_copy(&arg->stateid, &zero_stateid);
+ if (delegation_cred)
+ msg.rpc_cred = delegation_cred;
- status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+ status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
+
+ put_rpccred(delegation_cred);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
- trace_nfs4_setattr(inode, &arg.stateid, status);
+ trace_nfs4_setattr(inode, &arg->stateid, status);
return status;
}
@@ -2719,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_setattrargs arg = {
+ .fh = NFS_FH(inode),
+ .iap = sattr,
+ .server = server,
+ .bitmask = server->attr_bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .label = olabel,
+ .server = server,
+ };
struct nfs4_exception exception = {
.state = state,
.inode = inode,
+ .stateid = &arg.stateid,
};
int err;
+
+ arg.bitmask = nfs4_bitmask(server, ilabel);
+ if (ilabel)
+ arg.bitmask = nfs4_bitmask(server, olabel);
+
do {
- err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
+ err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2860,12 +2896,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
call_close |= is_wronly;
else if (is_wronly)
calldata->arg.fmode |= FMODE_WRITE;
+ if (calldata->arg.fmode != (FMODE_READ|FMODE_WRITE))
+ call_close |= is_rdwr;
} else if (is_rdwr)
calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
- if (calldata->arg.fmode == 0)
- call_close |= is_rdwr;
-
if (!nfs4_valid_open_stateid(state))
call_close = 0;
spin_unlock(&state->owner->so_lock);
@@ -3246,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int nfs4_do_find_root_sec(struct nfs_server *server,
- struct nfs_fh *fhandle, struct nfs_fsinfo *info)
-{
- int mv = server->nfs_client->cl_minorversion;
- return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
-}
-
/**
* nfs4_proc_get_rootfh - get file handle for server's pseudoroot
* @server: initialized nfs_server handle
@@ -3272,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
status = nfs4_lookup_root(server, fhandle, info);
if (auth_probe || status == NFS4ERR_WRONGSEC)
- status = nfs4_do_find_root_sec(server, fhandle, info);
+ status = server->nfs_client->cl_mvops->find_root_sec(server,
+ fhandle, info);
if (status == 0)
status = nfs4_server_capabilities(server, fhandle);
@@ -3509,7 +3538,7 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
}
static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
- struct qstr *name, struct nfs_fh *fhandle,
+ const struct qstr *name, struct nfs_fh *fhandle,
struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_exception exception = { };
@@ -3551,7 +3580,7 @@ out:
return err;
}
-static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
+static int nfs4_proc_lookup(struct inode *dir, const struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr,
struct nfs4_label *label)
{
@@ -3567,7 +3596,7 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
}
struct rpc_clnt *
-nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
+nfs4_proc_lookup_mountpoint(struct inode *dir, const struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct rpc_clnt *client = NFS_CLIENT(dir);
@@ -3726,7 +3755,7 @@ out:
return status;
}
-static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
+static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name)
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_removeargs args = {
@@ -3749,7 +3778,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
return status;
}
-static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
+static int nfs4_proc_remove(struct inode *dir, const struct qstr *name)
{
struct nfs4_exception exception = { };
int err;
@@ -3777,7 +3806,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
{
- nfs4_setup_sequence(NFS_SERVER(data->dir),
+ nfs4_setup_sequence(NFS_SB(data->dentry->d_sb),
&data->args.seq_args,
&data->res.seq_res,
task);
@@ -3832,7 +3861,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
return 1;
}
-static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_link_arg arg = {
@@ -3879,7 +3908,7 @@ out:
return status;
}
-static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
{
struct nfs4_exception exception = { };
int err;
@@ -3901,7 +3930,7 @@ struct nfs4_createdata {
};
static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
- struct qstr *name, struct iattr *sattr, u32 ftype)
+ const struct qstr *name, struct iattr *sattr, u32 ftype)
{
struct nfs4_createdata *data;
@@ -4285,7 +4314,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid,
if (l_ctx != NULL)
lockowner = &l_ctx->lockowner;
- return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner);
+ return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL);
}
EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
@@ -4371,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
hdr->timestamp = jiffies;
- hdr->pgio_done_cb = nfs4_read_done_cb;
+ if (!hdr->pgio_done_cb)
+ hdr->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
}
@@ -4993,12 +5023,11 @@ static int nfs4_do_set_security_label(struct inode *inode,
}
static int
-nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
{
struct nfs4_label ilabel, *olabel = NULL;
struct nfs_fattr fattr;
struct rpc_cred *cred;
- struct inode *inode = d_inode(dentry);
int status;
if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -6054,6 +6083,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct nfs4_state_owner *sp = state->owner;
unsigned char fl_flags = request->fl_flags;
int status = -ENOLCK;
@@ -6068,6 +6098,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
status = do_vfs_lock(state->inode, request);
if (status < 0)
goto out;
+ mutex_lock(&sp->so_delegreturn_mutex);
down_read(&nfsi->rwsem);
if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
/* Yes: cache locks! */
@@ -6075,9 +6106,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
request->fl_flags = fl_flags & ~FL_SLEEP;
status = do_vfs_lock(state->inode, request);
up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
}
up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
out:
request->fl_flags = fl_flags;
@@ -6255,18 +6288,18 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
- struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
{
- return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
+ return nfs4_proc_set_acl(inode, buf, buflen);
}
static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
- struct dentry *dentry, const char *key,
- void *buf, size_t buflen)
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
{
- return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
+ return nfs4_proc_get_acl(inode, buf, buflen);
}
static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
@@ -6277,22 +6310,22 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
- struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
{
if (security_ismaclabel(key))
- return nfs4_set_security_label(dentry, buf, buflen);
+ return nfs4_set_security_label(inode, buf, buflen);
return -EOPNOTSUPP;
}
static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
- struct dentry *dentry, const char *key,
- void *buf, size_t buflen)
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
{
if (security_ismaclabel(key))
- return nfs4_get_security_label(d_inode(dentry), buf, buflen);
+ return nfs4_get_security_label(inode, buf, buflen);
return -EOPNOTSUPP;
}
@@ -7351,9 +7384,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
* always set csa_cachethis to FALSE because the current implementation
* of the back channel DRC only supports caching the CB_SEQUENCE operation.
*/
-static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+ struct rpc_clnt *clnt)
{
unsigned int max_rqst_sz, max_resp_sz;
+ unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -7371,8 +7406,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
/* Back channel attributes */
- args->bc_attrs.max_rqst_sz = PAGE_SIZE;
- args->bc_attrs.max_resp_sz = PAGE_SIZE;
+ args->bc_attrs.max_rqst_sz = max_bc_payload;
+ args->bc_attrs.max_resp_sz = max_bc_payload;
args->bc_attrs.max_resp_sz_cached = 0;
args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
@@ -7476,7 +7511,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
};
int status;
- nfs4_init_channel_attrs(&args);
+ nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -7820,40 +7855,36 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
struct nfs4_session *session = nfs4_get_session(server);
- int ret;
dprintk("--> %s\n", __func__);
- /* Note the is a race here, where a CB_LAYOUTRECALL can come in
- * right now covering the LAYOUTGET we are about to send.
- * However, that is not so catastrophic, and there seems
- * to be no way to prevent it completely.
- */
- if (nfs41_setup_sequence(session, &lgp->args.seq_args,
- &lgp->res.seq_res, task))
- return;
- ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
- NFS_I(lgp->args.inode)->layout,
- &lgp->args.range,
- lgp->args.ctx->state);
- if (ret < 0)
- rpc_exit(task, ret);
+ nfs41_setup_sequence(session, &lgp->args.seq_args,
+ &lgp->res.seq_res, task);
+ dprintk("<-- %s\n", __func__);
}
+/*
+ * Per-task "done" step for LAYOUTGET (presumably the rpc_call_done hook of
+ * nfs4_layoutget_call_ops; the ops table body is not visible in this hunk).
+ *
+ * It only passes the SEQUENCE result to nfs41_sequence_done(). The return
+ * value of that call is ignored and task->tk_status is not examined here.
+ */
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ nfs41_sequence_done(task, &lgp->res.seq_res);
+ dprintk("<-- %s\n", __func__);
+}
+
+static int
+nfs4_layoutget_handle_exception(struct rpc_task *task,
+ struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
+{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo;
- struct nfs4_state *state = NULL;
- unsigned long timeo, now, giveup;
+ int nfs4err = task->tk_status;
+ int err, status = 0;
+ LIST_HEAD(head);
dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
- if (!nfs41_sequence_done(task, &lgp->res.seq_res))
- goto out;
-
- switch (task->tk_status) {
+ switch (nfs4err) {
case 0:
goto out;
@@ -7863,88 +7894,67 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
* retry go inband.
*/
case -NFS4ERR_LAYOUTUNAVAILABLE:
- task->tk_status = -ENODATA;
+ status = -ENODATA;
goto out;
/*
* NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
* length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
*/
case -NFS4ERR_BADLAYOUT:
- goto out_overflow;
+ status = -EOVERFLOW;
+ goto out;
/*
* NFS4ERR_LAYOUTTRYLATER is a conflict with another client
* (or clients) writing to the same RAID stripe except when
* the minlength argument is 0 (see RFC5661 section 18.43.3).
+ *
+ * Treat it like we would RECALLCONFLICT -- we retry for a little
+ * while, and then eventually give up.
*/
case -NFS4ERR_LAYOUTTRYLATER:
- if (lgp->args.minlength == 0)
- goto out_overflow;
- /*
- * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
- * existing layout before getting a new one).
- */
- case -NFS4ERR_RECALLCONFLICT:
- timeo = rpc_get_timeout(task->tk_client);
- giveup = lgp->args.timestamp + timeo;
- now = jiffies;
- if (time_after(giveup, now)) {
- unsigned long delay;
-
- /* Delay for:
- * - Not less then NFS4_POLL_RETRY_MIN.
- * - One last time a jiffie before we give up
- * - exponential backoff (time_now minus start_attempt)
- */
- delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
- min((giveup - now - 1),
- now - lgp->args.timestamp));
-
- dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
- __func__, delay);
- rpc_delay(task, delay);
- /* Do not call nfs4_async_handle_error() */
- goto out_restart;
+ if (lgp->args.minlength == 0) {
+ status = -EOVERFLOW;
+ goto out;
}
+ status = -EBUSY;
+ break;
+ case -NFS4ERR_RECALLCONFLICT:
+ status = -ERECALLCONFLICT;
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
+ exception->timeout = 0;
spin_lock(&inode->i_lock);
- if (nfs4_stateid_match(&lgp->args.stateid,
+ lo = NFS_I(inode)->layout;
+ /* If the open stateid was bad, then recover it. */
+ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ nfs4_stateid_match_other(&lgp->args.stateid,
&lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock);
- /* If the open stateid was bad, then recover it. */
- state = lgp->args.ctx->state;
+ exception->state = lgp->args.ctx->state;
break;
}
- lo = NFS_I(inode)->layout;
- if (lo && nfs4_stateid_match(&lgp->args.stateid,
- &lo->plh_stateid)) {
- LIST_HEAD(head);
-
- /*
- * Mark the bad layout state as invalid, then retry
- * with the current stateid.
- */
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
- spin_unlock(&inode->i_lock);
- pnfs_free_lseg_list(&head);
- } else
- spin_unlock(&inode->i_lock);
- goto out_restart;
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ status = -EAGAIN;
+ goto out;
+ }
+
+ err = nfs4_handle_exception(server, nfs4err, exception);
+ if (!status) {
+ if (exception->retry)
+ status = -EAGAIN;
+ else
+ status = err;
}
- if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
- goto out_restart;
out:
dprintk("<-- %s\n", __func__);
- return;
-out_restart:
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- return;
-out_overflow:
- task->tk_status = -EOVERFLOW;
- goto out;
+ return status;
}
static size_t max_response_pages(struct nfs_server *server)
@@ -8013,7 +8023,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
};
struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
@@ -8033,6 +8043,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
.flags = RPC_TASK_ASYNC,
};
struct pnfs_layout_segment *lseg = NULL;
+ struct nfs4_exception exception = {
+ .inode = inode,
+ .timeout = *timeout,
+ };
int status = 0;
dprintk("--> %s\n", __func__);
@@ -8046,7 +8060,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
return ERR_PTR(-ENOMEM);
}
lgp->args.layout.pglen = max_pages * PAGE_SIZE;
- lgp->args.timestamp = jiffies;
lgp->res.layoutp = &lgp->args.layout;
lgp->res.seq_res.sr_slot = NULL;
@@ -8056,13 +8069,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
if (IS_ERR(task))
return ERR_CAST(task);
status = nfs4_wait_for_completion_rpc_task(task);
- if (status == 0)
- status = task->tk_status;
+ if (status == 0) {
+ status = nfs4_layoutget_handle_exception(task, lgp, &exception);
+ *timeout = exception.timeout;
+ }
+
trace_nfs4_layoutget(lgp->args.ctx,
&lgp->args.range,
&lgp->res.range,
&lgp->res.stateid,
status);
+
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
@@ -8118,9 +8135,9 @@ static void nfs4_layoutreturn_release(void *calldata)
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
- pnfs_mark_layout_returned_if_empty(lo);
- if (lrp->res.lrs_present)
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
+ be32_to_cpu(lrp->args.stateid.seqid));
+ if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&lo->plh_inode->i_lock);
@@ -8653,6 +8670,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
static bool nfs41_match_stateid(const nfs4_stateid *s1,
const nfs4_stateid *s2)
{
+ if (s1->type != s2->type)
+ return false;
+
if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
return false;
@@ -8793,6 +8813,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1
| NFS_CAP_ALLOCATE
+ | NFS_CAP_COPY
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
@@ -8821,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif
};
-ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
ssize_t error, error2;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d854693a15b0e..834b875900d62 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -65,7 +65,10 @@
#define OPENOWNER_POOL_SIZE 8
-const nfs4_stateid zero_stateid;
+/*
+ * The all-zero stateid. Its payload is zero-filled and it is explicitly
+ * tagged NFS4_SPECIAL_STATEID_TYPE rather than relying on implicit
+ * zero-initialisation of the type field.
+ */
+const nfs4_stateid zero_stateid = {
+ { .data = { 0 } },
+ .type = NFS4_SPECIAL_STATEID_TYPE,
+};
static DEFINE_MUTEX(nfs_clid_init_mutex);
int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
* Byte-range lock aware utility to initialize the stateid of read/write
* requests.
*/
-int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
- fmode_t fmode, const struct nfs_lockowner *lockowner)
+int nfs4_select_rw_stateid(struct nfs4_state *state,
+ fmode_t fmode, const struct nfs_lockowner *lockowner,
+ nfs4_stateid *dst, struct rpc_cred **cred)
{
- int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+ int ret;
+
+ if (cred != NULL)
+ *cred = NULL;
+ ret = nfs4_copy_lock_stateid(dst, state, lockowner);
if (ret == -EIO)
/* A lost lock - don't even consider delegations */
goto out;
/* returns true if delegation stateid found and copied */
- if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+ if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) {
ret = 0;
goto out;
}
@@ -1480,9 +1488,9 @@ restart:
}
spin_unlock(&state->state_lock);
}
- nfs4_put_open_state(state);
clear_bit(NFS_STATE_RECLAIM_NOGRACE,
&state->flags);
+ nfs4_put_open_state(state);
spin_lock(&sp->so_lock);
goto restart;
}
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2c8d05dae5b16..cfb8f7ce5cf6d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1235,8 +1235,8 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event,
len = 0;
__entry->error = error < 0 ? error : 0;
__entry->id = id;
- memcpy(__get_dynamic_array(name), name, len);
- ((char *)__get_dynamic_array(name))[len] = 0;
+ memcpy(__get_str(name), name, len);
+ __get_str(name)[len] = 0;
),
TP_printk(
@@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
{ PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
{ PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
{ PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \
+ { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \
{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
TRACE_EVENT(pnfs_update_layout,
@@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout,
u64 count,
enum pnfs_iomode iomode,
struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
enum pnfs_update_layout_reason reason
),
- TP_ARGS(inode, pos, count, iomode, lo, reason),
+ TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u64, fileid)
@@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout,
__field(enum pnfs_iomode, iomode)
__field(int, layoutstateid_seq)
__field(u32, layoutstateid_hash)
+ __field(long, lseg)
__field(enum pnfs_update_layout_reason, reason)
),
TP_fast_assign(
@@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout,
__entry->layoutstateid_seq = 0;
__entry->layoutstateid_hash = 0;
}
+ __entry->lseg = (long)lseg;
),
TP_printk(
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"iomode=%s pos=%llu count=%llu "
- "layoutstateid=%d:0x%08x (%s)",
+ "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout,
(unsigned long long)__entry->pos,
(unsigned long long)__entry->count,
__entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ __entry->lseg,
show_pnfs_update_layout_reason(__entry->reason)
)
);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e4441216804e..7bd3a5c09d318 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
*p = cpu_to_be32(0); /* reclaim */
encode_nfs4_stateid(xdr, &args->stateid);
- p = reserve_space(xdr, 20);
- *p++ = cpu_to_be32(1); /* newoffset = TRUE */
- p = xdr_encode_hyper(p, args->lastbytewritten);
+ if (args->lastbytewritten != U64_MAX) {
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(1); /* newoffset = TRUE */
+ p = xdr_encode_hyper(p, args->lastbytewritten);
+ } else {
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(0); /* newoffset = FALSE */
+ }
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
@@ -4270,6 +4275,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
}
+/*
+ * Decode a stateid from @xdr into @stateid and tag it as an open stateid.
+ * The type field is assigned before decoding is attempted; the return value
+ * is whatever decode_stateid() returns.
+ */
+static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_OPEN_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+/*
+ * Decode a stateid from @xdr into @stateid and tag it as a lock stateid.
+ * The type field is assigned before decoding is attempted; the return value
+ * is whatever decode_stateid() returns.
+ */
+static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LOCK_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+/*
+ * Decode a stateid from @xdr into @stateid and tag it as a delegation
+ * stateid. The type field is assigned before decoding is attempted; the
+ * return value is whatever decode_stateid() returns.
+ */
+static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
{
int status;
@@ -4278,7 +4301,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
if (!status)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
return status;
}
@@ -4937,7 +4960,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
if (status == -EIO)
goto out;
if (status == 0) {
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_lock_stateid(xdr, &res->stateid);
if (unlikely(status))
goto out;
} else if (status == -NFS4ERR_DENIED)
@@ -4966,7 +4989,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
if (status != -EIO)
nfs_increment_lock_seqid(status, res->seqid);
if (status == 0)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_lock_stateid(xdr, &res->stateid);
return status;
}
@@ -5001,7 +5024,7 @@ static int decode_space_limit(struct xdr_stream *xdr,
blocksize = be32_to_cpup(p);
maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
- maxsize >>= PAGE_CACHE_SHIFT;
+ maxsize >>= PAGE_SHIFT;
*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
out_overflow:
@@ -5016,7 +5039,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
__be32 *p;
int status;
- status = decode_stateid(xdr, &res->delegation);
+ status = decode_delegation_stateid(xdr, &res->delegation);
if (unlikely(status))
return status;
p = xdr_inline_decode(xdr, 4);
@@ -5096,7 +5119,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
nfs_increment_open_seqid(status, res->seqid);
if (status)
return status;
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
if (unlikely(status))
return status;
@@ -5136,7 +5159,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
if (!status)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
return status;
}
@@ -5148,7 +5171,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
if (!status)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_open_stateid(xdr, &res->stateid);
return status;
}
@@ -5838,6 +5861,12 @@ out_overflow:
}
#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode a stateid from @xdr into @stateid and tag it as a layout stateid.
+ * The type field is assigned before decoding is attempted; the return value
+ * is whatever decode_stateid() returns.
+ */
+static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
static int decode_getdeviceinfo(struct xdr_stream *xdr,
struct nfs4_getdeviceinfo_res *res)
{
@@ -5919,7 +5948,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
if (unlikely(!p))
goto out_overflow;
res->return_on_close = be32_to_cpup(p);
- decode_stateid(xdr, &res->stateid);
+ decode_layout_stateid(xdr, &res->stateid);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
@@ -5985,7 +6014,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
goto out_overflow;
res->lrs_present = be32_to_cpup(p);
if (res->lrs_present)
- status = decode_stateid(xdr, &res->stateid);
+ status = decode_layout_stateid(xdr, &res->stateid);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -7515,6 +7544,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
PROC(CLONE, enc_clone, dec_clone),
+ PROC(COPY, enc_copy, dec_copy),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 9f80a086b612a..2ca9167bc97d0 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -37,7 +37,6 @@
{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
{ 1 << NFS_INO_STALE, "STALE" }, \
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
- { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
@@ -702,14 +701,14 @@ TRACE_EVENT(nfs_sillyrename_unlink,
),
TP_fast_assign(
- struct inode *dir = data->dir;
+ struct inode *dir = d_inode(data->dentry->d_parent);
size_t len = data->args.name.len;
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->error = error;
- memcpy(__get_dynamic_array(name),
+ memcpy(__get_str(name),
data->args.name.name, len);
- ((char *)__get_dynamic_array(name))[len] = 0;
+ __get_str(name)[len] = 0;
),
TP_printk(
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9aebffb405059..049c1b1f2932b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -486,7 +486,7 @@ static void __r4w_put_page(void *priv, struct page *page)
dprintk("%s: index=0x%lx\n", __func__,
(page == ZERO_PAGE(0)) ? -1UL : page->index);
if (ZERO_PAGE(0) != page)
- page_cache_release(page);
+ put_page(page);
return;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 8ce4f61cbaa5f..174dd4cf5747f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
* long write-back delay. This will be adjusted in
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
- req->wb_index = page_file_index(page);
- page_cache_get(page);
+ if (page) {
+ req->wb_index = page_file_index(page);
+ get_page(page);
+ }
req->wb_offset = offset;
req->wb_pgbase = offset;
req->wb_bytes = count;
@@ -392,7 +394,7 @@ static void nfs_clear_request(struct nfs_page *req)
struct nfs_lock_context *l_ctx = req->wb_lock_context;
if (page != NULL) {
- page_cache_release(page);
+ put_page(page);
req->wb_page = NULL;
}
if (l_ctx != NULL) {
@@ -904,7 +906,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
return false;
} else {
if (req->wb_pgbase != 0 ||
- prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+ prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
return false;
}
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2fa483e6dbe2e..70806cae0d36b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
* is required.
* Note that caller must hold inode->i_lock.
*/
-static int
+int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list)
{
@@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
};
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+ return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
}
static int
@@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
spin_lock(&inode->i_lock);
pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
- pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
@@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
}
+/*
+ * Initialise a layout segment that is about to be attached to @lo.
+ *
+ * Both list heads are initialised empty, the refcount starts at 1 and
+ * NFS_LSEG_VALID is set. In addition, the segment records a copy of @range
+ * and the host-endian seqid of @stateid in pls_seq.
+ */
static void
-init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
{
INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list);
atomic_set(&lseg->pls_refcount, 1);
- smp_mb();
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
+ lseg->pls_range = *range;
+ lseg->pls_seq = be32_to_cpu(stateid->seqid);
}
static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
@@ -361,8 +364,10 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
list_del_init(&lseg->pls_list);
/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
atomic_dec(&lo->plh_refcount);
- if (list_empty(&lo->plh_segs))
+ if (list_empty(&lo->plh_segs)) {
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ }
rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}
@@ -484,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
(end2 == NFS4_MAX_UINT64 || end2 > start1);
}
-static bool
-should_free_lseg(const struct pnfs_layout_range *lseg_range,
- const struct pnfs_layout_range *recall_range)
-{
- return (recall_range->iomode == IOMODE_ANY ||
- lseg_range->iomode == recall_range->iomode) &&
- pnfs_lseg_range_intersecting(lseg_range, recall_range);
-}
-
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list)
{
@@ -522,13 +518,56 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
return rv;
}
-/* Returns count of number of matching invalid lsegs remaining in list
- * after call.
+/*
+ * Compare two layout stateid sequence ids, taking wraparound into account.
+ *
+ * The unsigned difference s1 - s2 is reinterpreted as a signed 32-bit value,
+ * so s1 counts as newer when it is ahead of s2 by less than 2^31, even if
+ * the counter wrapped in between. Equal seqids are not "newer".
+ *
+ * Returns true if @s1 is newer than @s2.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+ return (s32)(s1 - s2) > 0;
+}
+
+/*
+ * Range/iomode part of the recall match.
+ *
+ * Returns true when the recall iomode is IOMODE_ANY or equals the segment's
+ * iomode, and the two ranges intersect according to
+ * pnfs_lseg_range_intersecting(). @recall_range must not be NULL here.
+ */
+static bool
+pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *recall_range)
+{
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+/*
+ * Decide whether @lseg is covered by a recall/return described by
+ * @recall_range and @seq.
+ *
+ * @seq == 0 disables the sequence filter. Otherwise a segment whose pls_seq
+ * is newer than @seq never matches. A NULL @recall_range matches every
+ * segment that passed the sequence filter; a non-NULL one is checked with
+ * pnfs_should_free_range().
+ */
+static bool
+pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq)
+{
+ if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+ return false;
+ if (recall_range == NULL)
+ return true;
+ return pnfs_should_free_range(&lseg->pls_range, recall_range);
+}
+
+/**
+ * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
+ * @lo: layout header containing the lsegs
+ * @tmp_list: list head where doomed lsegs should go
+ * @recall_range: optional recall range argument to match (may be NULL)
+ * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
+ *
+ * Walk the list of lsegs in the layout header, and tear down any that should
+ * be destroyed. If "recall_range" is specified then the segment must match
+ * that range. If "seq" is non-zero, then only match segments that were handed
+ * out at or before that sequence.
+ *
+ * Returns number of matching invalid lsegs remaining in list after scanning
+ * it and purging them.
*/
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *recall_range)
+ const struct pnfs_layout_range *recall_range,
+ u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
int remaining = 0;
@@ -538,12 +577,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
if (list_empty(&lo->plh_segs))
return 0;
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (!recall_range ||
- should_free_lseg(&lseg->pls_range, recall_range)) {
- dprintk("%s: freeing lseg %p iomode %d "
+ if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
+ dprintk("%s: freeing lseg %p iomode %d seq %u "
"offset %llu length %llu\n", __func__,
- lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
- lseg->pls_range.length);
+ lseg, lseg->pls_range.iomode, lseg->pls_seq,
+ lseg->pls_range.offset, lseg->pls_range.length);
if (!mark_lseg_invalid(lseg, tmp_list))
remaining++;
}
@@ -730,38 +768,30 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
-/*
- * Compare 2 layout stateid sequence ids, to see which is newer,
- * taking into account wraparound issues.
- */
-static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
-{
- return (s32)(s1 - s2) > 0;
-}
-
/* update lo->plh_stateid with new if is more recent */
+/*
+ * In the new form, lo->plh_stateid is replaced by @new when the layout is
+ * currently not valid (per pnfs_layout_is_valid()) or when @new carries a
+ * newer seqid than the stored one.
+ *
+ * The barrier candidate is new->seqid when @update_barrier is true,
+ * otherwise newseq - lo->plh_outstanding as computed when the stateid was
+ * replaced. lo->plh_barrier is overwritten when the layout is not valid or
+ * the candidate is newer than the current barrier.
+ */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
bool update_barrier)
{
- u32 oldseq, newseq, new_barrier;
- int empty = list_empty(&lo->plh_segs);
+ u32 oldseq, newseq, new_barrier = 0;
+ bool invalid = !pnfs_layout_is_valid(lo);
oldseq = be32_to_cpu(lo->plh_stateid.seqid);
newseq = be32_to_cpu(new->seqid);
- if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
+ if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
nfs4_stateid_copy(&lo->plh_stateid, new);
- if (update_barrier) {
- new_barrier = be32_to_cpu(new->seqid);
- } else {
- /* Because of wraparound, we want to keep the barrier
- * "close" to the current seqids.
- */
- new_barrier = newseq - atomic_read(&lo->plh_outstanding);
- }
- if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
- lo->plh_barrier = new_barrier;
+ /*
+ * Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids.
+ */
+ new_barrier = newseq - atomic_read(&lo->plh_outstanding);
}
+ /*
+ * new_barrier is still 0 if the stateid was not replaced above, in
+ * which case there is nothing to move unless the caller forces it.
+ * NOTE(review): newseq - plh_outstanding can also evaluate to 0, and
+ * then the barrier is likewise left untouched -- confirm intended.
+ */
+ if (update_barrier)
+ new_barrier = be32_to_cpu(new->seqid);
+ else if (new_barrier == 0)
+ return;
+ if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+ lo->plh_barrier = new_barrier;
}
static bool
@@ -781,50 +811,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
-int
-pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- const struct pnfs_layout_range *range,
- struct nfs4_state *open_state)
-{
- int status = 0;
-
- dprintk("--> %s\n", __func__);
- spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo)) {
- status = -EAGAIN;
- } else if (!nfs4_valid_open_stateid(open_state)) {
- status = -EBADF;
- } else if (list_empty(&lo->plh_segs) ||
- test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
- int seq;
-
- do {
- seq = read_seqbegin(&open_state->seqlock);
- nfs4_stateid_copy(dst, &open_state->stateid);
- } while (read_seqretry(&open_state->seqlock, seq));
- } else
- nfs4_stateid_copy(dst, &lo->plh_stateid);
- spin_unlock(&lo->plh_inode->i_lock);
- dprintk("<-- %s\n", __func__);
- return status;
-}
-
/*
-* Get layout from server.
-* for now, assume that whole file layouts are requested.
-* arg->offset: 0
-* arg->length: all ones
-*/
+ * Get layout from server.
+ * for now, assume that whole file layouts are requested.
+ * arg->offset: 0
+ * arg->length: all ones
+ */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
+ nfs4_stateid *stateid,
const struct pnfs_layout_range *range,
- gfp_t gfp_flags)
+ long *timeout, gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
struct nfs_server *server = NFS_SERVER(ino);
struct nfs4_layoutget *lgp;
- struct pnfs_layout_segment *lseg;
loff_t i_size;
dprintk("--> %s\n", __func__);
@@ -834,40 +836,31 @@ send_layoutget(struct pnfs_layout_hdr *lo,
* store in lseg. If we race with a concurrent seqid morphing
* op, then re-send the LAYOUTGET.
*/
- do {
- lgp = kzalloc(sizeof(*lgp), gfp_flags);
- if (lgp == NULL)
- return NULL;
-
- i_size = i_size_read(ino);
-
- lgp->args.minlength = PAGE_CACHE_SIZE;
- if (lgp->args.minlength > range->length)
- lgp->args.minlength = range->length;
- if (range->iomode == IOMODE_READ) {
- if (range->offset >= i_size)
- lgp->args.minlength = 0;
- else if (i_size - range->offset < lgp->args.minlength)
- lgp->args.minlength = i_size - range->offset;
- }
- lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- pnfs_copy_range(&lgp->args.range, range);
- lgp->args.type = server->pnfs_curr_ld->id;
- lgp->args.inode = ino;
- lgp->args.ctx = get_nfs_open_context(ctx);
- lgp->gfp_flags = gfp_flags;
- lgp->cred = lo->plh_lc_cred;
-
- lseg = nfs4_proc_layoutget(lgp, gfp_flags);
- } while (lseg == ERR_PTR(-EAGAIN));
-
- if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
- lseg = NULL;
- else
- pnfs_layout_clear_fail_bit(lo,
- pnfs_iomode_to_fail_bit(range->iomode));
+ lgp = kzalloc(sizeof(*lgp), gfp_flags);
+ if (lgp == NULL)
+ return ERR_PTR(-ENOMEM);
- return lseg;
+ i_size = i_size_read(ino);
+
+ lgp->args.minlength = PAGE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
+ }
+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+ pnfs_copy_range(&lgp->args.range, range);
+ lgp->args.type = server->pnfs_curr_ld->id;
+ lgp->args.inode = ino;
+ lgp->args.ctx = get_nfs_open_context(ctx);
+ nfs4_stateid_copy(&lgp->args.stateid, stateid);
+ lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
+
+ return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
}
static void pnfs_clear_layoutcommit(struct inode *inode,
@@ -893,14 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
+/*
+ * Forget any pending "return requested" state on @lo: reset the recorded
+ * return iomode and return sequence to 0 and clear
+ * NFS_LAYOUT_RETURN_REQUESTED.
+ */
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+ lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
+/*
+ * Claim the right to send a LAYOUTRETURN for @lo and report what to send.
+ *
+ * Returns false, touching nothing else, if NFS_LAYOUT_RETURN was already
+ * set. Otherwise the bit is now set, a reference on @lo has been taken via
+ * pnfs_get_layout_hdr(), and true is returned.
+ *
+ * @stateid and @iomode are optional outputs (either may be NULL):
+ *  - with NFS_LAYOUT_RETURN_REQUESTED set, @stateid receives lo->plh_stateid
+ *    with its seqid replaced by lo->plh_return_seq when that is non-zero,
+ *    @iomode receives lo->plh_return_iomode, and the pending return info is
+ *    then cleared;
+ *  - otherwise @stateid receives lo->plh_stateid unchanged and @iomode is
+ *    IOMODE_ANY.
+ *
+ * NOTE(review): the callers visible in this diff invoke it before dropping
+ * inode->i_lock -- presumably the lock must be held; confirm.
+ */
static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
+ nfs4_stateid *stateid,
+ enum pnfs_iomode *iomode)
{
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
- lo->plh_return_iomode = 0;
pnfs_get_layout_hdr(lo);
- clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
+ if (stateid != NULL) {
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
+ }
+ if (iomode != NULL)
+ *iomode = lo->plh_return_iomode;
+ pnfs_clear_layoutreturn_info(lo);
+ return true;
+ }
+ if (stateid != NULL)
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ if (iomode != NULL)
+ *iomode = IOMODE_ANY;
return true;
}
@@ -968,9 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
enum pnfs_iomode iomode;
bool send;
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- iomode = lo->plh_return_iomode;
- send = pnfs_prepare_layoutreturn(lo);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
@@ -1007,12 +1021,11 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__);
goto out;
}
- nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
pnfs_clear_layoutcommit(ino, &tmp_list);
- pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+ pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
struct pnfs_layout_range range = {
@@ -1030,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino)
goto out_put_layout_hdr;
}
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- send = pnfs_prepare_layoutreturn(lo);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
if (send)
@@ -1098,11 +1110,10 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo,
+ &stateid, NULL);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */
@@ -1150,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
- pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
@@ -1310,6 +1320,7 @@ alloc_init_layout_hdr(struct inode *ino,
INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
lo->plh_lc_cred = get_rpccred(ctx->cred);
+ lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
return lo;
}
@@ -1317,6 +1328,8 @@ static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
struct nfs_open_context *ctx,
gfp_t gfp_flags)
+ __releases(&ino->i_lock)
+ __acquires(&ino->i_lock)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct pnfs_layout_hdr *new = NULL;
@@ -1341,23 +1354,28 @@ out_existing:
/*
* iomode matching rules:
- * iomode lseg match
- * ----- ----- -----
- * ANY READ true
- * ANY RW true
- * RW READ false
- * RW RW true
- * READ READ true
- * READ RW true
+ * iomode lseg strict match
+ * iomode
+ * ----- ----- ------ -----
+ * ANY READ N/A true
+ * ANY RW N/A true
+ * RW READ N/A false
+ * RW RW N/A true
+ * READ READ N/A true
+ * READ RW true false
+ * READ RW false true
*/
static bool
pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
- const struct pnfs_layout_range *range)
+ const struct pnfs_layout_range *range,
+ bool strict_iomode)
{
struct pnfs_layout_range range1;
if ((range->iomode == IOMODE_RW &&
ls_range->iomode != IOMODE_RW) ||
+ (range->iomode != ls_range->iomode &&
+ strict_iomode == true) ||
!pnfs_lseg_range_intersecting(ls_range, range))
return 0;
@@ -1372,7 +1390,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
*/
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range)
+ struct pnfs_layout_range *range,
+ bool strict_iomode)
{
struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -1381,7 +1400,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
- pnfs_lseg_range_match(&lseg->pls_range, range)) {
+ pnfs_lseg_range_match(&lseg->pls_range, range,
+ strict_iomode)) {
ret = pnfs_get_lseg(lseg);
break;
}
@@ -1498,6 +1518,7 @@ pnfs_update_layout(struct inode *ino,
loff_t pos,
u64 count,
enum pnfs_iomode iomode,
+ bool strict_iomode,
gfp_t gfp_flags)
{
struct pnfs_layout_range arg = {
@@ -1505,27 +1526,30 @@ pnfs_update_layout(struct inode *ino,
.offset = pos,
.length = count,
};
- unsigned pg_offset;
+ unsigned pg_offset, seq;
struct nfs_server *server = NFS_SERVER(ino);
struct nfs_client *clp = server->nfs_client;
- struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_hdr *lo = NULL;
struct pnfs_layout_segment *lseg = NULL;
+ nfs4_stateid stateid;
+ long timeout = 0;
+ unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
bool first;
if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_NO_PNFS);
goto out;
}
if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
goto out;
}
if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_MDSTHRESH);
goto out;
}
@@ -1536,14 +1560,14 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
- trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
}
/* Do we even need to bother with this? */
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_BULK_RECALL);
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
@@ -1551,14 +1575,33 @@ lookup_again:
/* if LAYOUTGET already failed once we don't try again */
if (pnfs_layout_io_test_failed(lo, iomode)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
goto out_unlock;
}
- first = list_empty(&lo->plh_segs);
- if (first) {
- /* The first layoutget for the file. Need to serialize per
+ lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+ goto out_unlock;
+ }
+
+ if (!nfs4_valid_open_stateid(ctx->state)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+ goto out_unlock;
+ }
+
+ /*
+ * Choose a stateid for the LAYOUTGET. If we don't have a layout
+ * stateid, or it has been invalidated, then we must use the open
+ * stateid.
+ */
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+
+ /*
+ * The first layoutget for the file. Need to serialize per
* RFC 5661 Errata 3208.
*/
if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
@@ -1567,18 +1610,17 @@ lookup_again:
wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
TASK_UNINTERRUPTIBLE);
pnfs_put_layout_hdr(lo);
+ dprintk("%s retrying\n", __func__);
goto lookup_again;
}
+
+ first = true;
+ do {
+ seq = read_seqbegin(&ctx->state->seqlock);
+ nfs4_stateid_copy(&stateid, &ctx->state->stateid);
+ } while (read_seqretry(&ctx->state->seqlock, seq));
} else {
- /* Check to see if the layout for the given range
- * already exists
- */
- lseg = pnfs_find_lseg(lo, &arg);
- if (lseg) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
- PNFS_UPDATE_LAYOUT_FOUND_CACHED);
- goto out_unlock;
- }
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
}
/*
@@ -1593,15 +1635,17 @@ lookup_again:
pnfs_clear_first_layoutget(lo);
pnfs_put_layout_hdr(lo);
dprintk("%s retrying\n", __func__);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ lseg, PNFS_UPDATE_LAYOUT_RETRY);
goto lookup_again;
}
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_RETURN);
goto out_put_layout_hdr;
}
if (pnfs_layoutgets_blocked(lo)) {
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_BLOCKED);
goto out_unlock;
}
@@ -1618,18 +1662,55 @@ lookup_again:
spin_unlock(&clp->cl_lock);
}
- pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+ pg_offset = arg.offset & ~PAGE_MASK;
if (pg_offset) {
arg.offset -= pg_offset;
arg.length += pg_offset;
}
if (arg.length != NFS4_MAX_UINT64)
- arg.length = PAGE_CACHE_ALIGN(arg.length);
+ arg.length = PAGE_ALIGN(arg.length);
- lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
- atomic_dec(&lo->plh_outstanding);
- trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+ atomic_dec(&lo->plh_outstanding);
+ if (IS_ERR(lseg)) {
+ switch(PTR_ERR(lseg)) {
+ case -EBUSY:
+ if (time_after(jiffies, giveup))
+ lseg = NULL;
+ break;
+ case -ERECALLCONFLICT:
+ /* Huh? We hold no layouts, how is there a recall? */
+ if (first) {
+ lseg = NULL;
+ break;
+ }
+ /* Destroy the existing layout and start over */
+ if (time_after(jiffies, giveup))
+ pnfs_destroy_layout(NFS_I(ino));
+ /* Fallthrough */
+ case -EAGAIN:
+ break;
+ default:
+ if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ }
+ goto out_put_layout_hdr;
+ }
+ if (lseg) {
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+ } else {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ }
+
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
@@ -1678,38 +1759,34 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
LIST_HEAD(free_me);
- int status = -EINVAL;
if (!pnfs_sanity_check_layout_range(&res->range))
- goto out;
+ return ERR_PTR(-EINVAL);
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
- if (!lseg || IS_ERR(lseg)) {
+ if (IS_ERR_OR_NULL(lseg)) {
if (!lseg)
- status = -ENOMEM;
- else
- status = PTR_ERR(lseg);
- dprintk("%s: Could not allocate layout: error %d\n",
- __func__, status);
- goto out;
+ lseg = ERR_PTR(-ENOMEM);
+
+ dprintk("%s: Could not allocate layout: error %ld\n",
+ __func__, PTR_ERR(lseg));
+ return lseg;
}
- init_lseg(lo, lseg);
- lseg->pls_range = res->range;
+ pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
spin_lock(&ino->i_lock);
if (pnfs_layoutgets_blocked(lo)) {
dprintk("%s forget reply due to state\n", __func__);
- goto out_forget_reply;
+ goto out_forget;
}
if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
- status = -EAGAIN;
- goto out_forget_reply;
+ goto out_forget;
}
pnfs_set_layout_stateid(lo, &res->stateid, false);
} else {
@@ -1718,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
* inode invalid, and don't bother validating the stateid
* sequence number.
*/
- pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+ pnfs_mark_layout_stateid_invalid(lo, &free_me);
nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
}
- clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-
pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg, &free_me);
+ if (!pnfs_layout_is_valid(lo)) {
+ pnfs_clear_layoutreturn_info(lo);
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ }
+
if (res->return_on_close)
set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
@@ -1735,25 +1815,26 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me);
return lseg;
-out:
- return ERR_PTR(status);
-out_forget_reply:
+out_forget:
spin_unlock(&ino->i_lock);
lseg->pls_layout = lo;
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
- goto out;
+ return ERR_PTR(-EAGAIN);
}
static void
-pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+ u32 seq)
{
- if (lo->plh_return_iomode == iomode)
- return;
- if (lo->plh_return_iomode != 0)
+ if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ if (seq != 0) {
+ WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
+ lo->plh_return_seq = seq;
+ }
}
/**
@@ -1769,7 +1850,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *return_range)
+ const struct pnfs_layout_range *return_range,
+ u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
int remaining = 0;
@@ -1782,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(&lseg->pls_range, return_range)) {
+ if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
dprintk("%s: marking lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode,
@@ -1792,8 +1874,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
- pnfs_set_plh_return_iomode(lo, return_range->iomode);
}
+
+ if (remaining)
+ pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+
return remaining;
}
@@ -1810,18 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false;
spin_lock(&inode->i_lock);
- pnfs_set_plh_return_iomode(lo, range.iomode);
+ pnfs_set_plh_return_info(lo, range.iomode, 0);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
nfs4_stateid stateid;
- enum pnfs_iomode iomode = lo->plh_return_iomode;
+ enum pnfs_iomode iomode;
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- return_now = pnfs_prepare_layoutreturn(lo);
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
spin_unlock(&inode->i_lock);
if (return_now)
pnfs_send_layoutreturn(lo, &stateid, iomode, false);
@@ -1849,6 +1933,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
req_offset(req),
rd_size,
IOMODE_READ,
+ false,
GFP_KERNEL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1873,6 +1958,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
req_offset(req),
wb_size,
IOMODE_RW,
+ false,
GFP_NOFS);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -2143,12 +2229,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
}
/* Resend all requests through pnfs. */
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
{
struct nfs_pageio_descriptor pgio;
- nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
- return nfs_pageio_resend(&pgio, hdr);
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ nfs_pageio_init_read(&pgio, hdr->inode, false,
+ hdr->completion_ops);
+ hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
+ }
}
EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
@@ -2158,12 +2247,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
- int err = 0;
trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
if (trypnfs == PNFS_TRY_AGAIN)
- err = pnfs_read_resend_pnfs(hdr);
- if (trypnfs == PNFS_NOT_ATTEMPTED || err)
+ pnfs_read_resend_pnfs(hdr);
+ if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
pnfs_read_through_mds(desc, hdr);
}
@@ -2332,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
- data->args.lastbytewritten = end_pos - 1;
+ if (end_pos != 0)
+ data->args.lastbytewritten = end_pos - 1;
+ else
+ data->args.lastbytewritten = U64_MAX;
data->res.server = NFS_SERVER(inode);
if (ld->prepare_layoutcommit) {
@@ -2405,7 +2496,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
spin_lock(&inode->i_lock);
if (!NFS_I(inode)->layout) {
spin_unlock(&inode->i_lock);
- goto out;
+ goto out_clear_layoutstats;
}
hdr = NFS_I(inode)->layout;
pnfs_get_layout_hdr(hdr);
@@ -2434,6 +2525,7 @@ out_free:
kfree(data);
out_put:
pnfs_put_layout_hdr(hdr);
+out_clear_layoutstats:
smp_mb__before_atomic();
clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
smp_mb__after_atomic();
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1ac1db5f6dadb..31d99b2927b00 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -64,6 +64,7 @@ struct pnfs_layout_segment {
struct list_head pls_lc_list;
struct pnfs_layout_range pls_range;
atomic_t pls_refcount;
+ u32 pls_seq;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
struct work_struct pls_work;
@@ -194,6 +195,7 @@ struct pnfs_layout_hdr {
unsigned long plh_flags;
nfs4_stateid plh_stateid;
u32 plh_barrier; /* ignore lower seqids */
+ u32 plh_return_seq;
enum pnfs_iomode plh_return_iomode;
loff_t plh_lwb; /* last write byte for layoutcommit */
struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
struct rpc_cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
/* pnfs.c */
@@ -258,16 +260,16 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new,
bool update_barrier);
-int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
- struct pnfs_layout_hdr *lo,
- const struct pnfs_layout_range *range,
- struct nfs4_state *open_state);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- const struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
+int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -282,12 +284,13 @@ int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
void pnfs_ld_write_done(struct nfs_pgio_header *);
void pnfs_ld_read_done(struct nfs_pgio_header *);
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
u64 count,
enum pnfs_iomode iomode,
+ bool strict_iomode,
gfp_t gfp_flags);
void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
@@ -374,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode)
return NFS_I(inode)->layout != NULL;
}
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
+}
+
static inline struct nfs4_deviceid_node *
nfs4_get_deviceid(struct nfs4_deviceid_node *d)
{
@@ -544,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end)
return 1 + end - offset;
}
-/**
- * pnfs_mark_layout_returned_if_empty - marks the layout as returned
- * @lo: layout header
- *
- * Note: Caller must hold inode->i_lock
- */
-static inline void
-pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
-{
- if (list_empty(&lo->plh_segs))
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-}
-
static inline void
pnfs_copy_range(struct pnfs_layout_range *dst,
const struct pnfs_layout_range *src)
@@ -628,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync)
}
static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ return false;
+}
+
+
+static inline bool
pnfs_roc(struct inode *ino)
{
return false;
@@ -715,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
return false;
}
-static inline bool
-pnfs_layoutcommit_outstanding(struct inode *inode)
-{
- return false;
-}
-
-
static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
return NULL;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 4aaed890048fd..f3468b57a32a3 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
/* The generic layer is about to remove the req from the commit list.
* If this will make the bucket empty, it will need to put the lseg reference.
- * Note this must be called holding the inode (/cinfo) lock
+ * Note this must be called holding i_lock
*/
void
pnfs_generic_clear_request_commit(struct nfs_page *req,
@@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
if (!nfs_lock_request(req))
continue;
kref_get(&req->wb_kref);
- if (cond_resched_lock(cinfo->lock))
+ if (cond_resched_lock(&cinfo->inode->i_lock))
list_safe_reset_next(req, tmp, wb_list);
nfs_request_remove_commit_list(req, cinfo);
clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
struct list_head *dst = &bucket->committing;
int ret;
- lockdep_assert_held(cinfo->lock);
+ lockdep_assert_held(&cinfo->inode->i_lock);
ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
if (ret) {
cinfo->ds->nwritten -= ret;
@@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
{
int i, rv = 0, cnt;
- lockdep_assert_held(cinfo->lock);
+ lockdep_assert_held(&cinfo->inode->i_lock);
for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
cinfo, max);
@@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
struct pnfs_layout_segment *freeme;
int i;
- lockdep_assert_held(cinfo->lock);
+ lockdep_assert_held(&cinfo->inode->i_lock);
restart:
for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
if (pnfs_generic_transfer_commit_list(&b->written, dst,
cinfo, 0)) {
freeme = b->wlseg;
b->wlseg = NULL;
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
pnfs_put_lseg(freeme);
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
goto restart;
}
}
@@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
LIST_HEAD(pages);
int i;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
for (i = idx; i < fl_cinfo->nbuckets; i++) {
bucket = &fl_cinfo->buckets[i];
if (list_empty(&bucket->committing))
@@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
freeme = bucket->clseg;
bucket->clseg = NULL;
list_splice_init(&bucket->committing, &pages);
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
nfs_retry_commit(&pages, freeme, cinfo, i);
pnfs_put_lseg(freeme);
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
}
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
}
static unsigned int
@@ -238,14 +238,39 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages,
struct pnfs_commit_bucket *bucket;
bucket = &cinfo->ds->buckets[data->ds_commit_index];
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
list_splice_init(&bucket->committing, pages);
data->lseg = bucket->clseg;
bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
}
+/* Helper function for pnfs_generic_commit_pagelist to catch an empty
+ * page list. This can happen when two commits race.
+ *
+ * This must be called instead of nfs_init_commit - call one or the other, but
+ * not both!
+ */
+static bool
+pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
+ struct nfs_commit_data *data,
+ struct nfs_commit_info *cinfo)
+{
+ if (list_empty(pages)) {
+ if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
+ wake_up_atomic_t(&cinfo->mds->rpcs_out);
+ /* don't call nfs_commitdata_release - it tries to put
+ * the open_context which is not acquired until nfs_init_commit
+ * which has not been called on @data */
+ WARN_ON_ONCE(data->context);
+ nfs_commit_free(data);
+ return true;
+ }
+
+ return false;
+}
+
/* This follows nfs_commit_list pretty closely */
int
pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -280,6 +305,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
list_for_each_entry_safe(data, tmp, &list, pages) {
list_del_init(&data->pages);
if (data->ds_commit_index < 0) {
+ /* another commit raced with us */
+ if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
+ data, cinfo))
+ continue;
+
nfs_init_commit(data, mds_pages, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
@@ -288,6 +318,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
LIST_HEAD(pages);
pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+
+ /* another commit raced with us */
+ if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
+ data, cinfo))
+ continue;
+
nfs_init_commit(data, &pages, data->lseg, cinfo);
initiate_commit(data, how);
}
@@ -559,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
}
static struct nfs_client *(*get_v3_ds_connect)(
- struct nfs_client *mds_clp,
+ struct nfs_server *mds_srv,
const struct sockaddr *ds_addr,
int ds_addrlen,
int ds_proto,
@@ -618,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
rpc_clnt_test_and_add_xprt, NULL);
} else
- clp = get_v3_ds_connect(mds_srv->nfs_client,
+ clp = get_v3_ds_connect(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, au_flavor);
@@ -654,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- clp = nfs4_set_ds_client(mds_srv->nfs_client,
+ clp = nfs4_set_ds_client(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, minor_version,
@@ -874,12 +910,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
struct list_head *list;
struct pnfs_commit_bucket *buckets;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
buckets = cinfo->ds->buckets;
list = &buckets[ds_commit_idx].written;
if (list_empty(list)) {
if (!pnfs_is_valid_lseg(lseg)) {
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
cinfo->completion_ops->resched_write(cinfo, req);
return;
}
@@ -896,7 +932,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
cinfo->ds->nwritten++;
nfs_request_add_commit_list_locked(req, list, cinfo);
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
@@ -904,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
int
pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
{
+ int ret;
+
+ if (!pnfs_layoutcommit_outstanding(inode))
+ return 0;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ return ret;
if (datasync)
return 0;
return pnfs_layoutcommit_inode(inode, true);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b417bbcd97046..b7bca83039895 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -145,7 +145,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
static int
-nfs_proc_lookup(struct inode *dir, struct qstr *name,
+nfs_proc_lookup(struct inode *dir, const struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr,
struct nfs4_label *label)
{
@@ -299,7 +299,7 @@ out:
}
static int
-nfs_proc_remove(struct inode *dir, struct qstr *name)
+nfs_proc_remove(struct inode *dir, const struct qstr *name)
{
struct nfs_removeargs arg = {
.fh = NFS_FH(dir),
@@ -357,7 +357,7 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
}
static int
-nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
{
struct nfs_linkargs arg = {
.fromfh = NFS_FH(inode),
@@ -456,7 +456,7 @@ out:
}
static int
-nfs_proc_rmdir(struct inode *dir, struct qstr *name)
+nfs_proc_rmdir(struct inode *dir, const struct qstr *name)
{
struct nfs_diropargs arg = {
.fh = NFS_FH(dir),
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index eb31e23e7defa..572e5b3b06f15 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -46,7 +46,7 @@ static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
static
int nfs_return_empty_page(struct page *page)
{
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
SetPageUptodate(page);
unlock_page(page);
return 0;
@@ -118,8 +118,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
unlock_page(page);
return PTR_ERR(new);
}
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
@@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page)
int error;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_CACHE_SIZE, page_file_index(page));
+ page, PAGE_SIZE, page_file_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
@@ -361,19 +361,19 @@ readpage_async_filler(void *data, struct page *page)
if (IS_ERR(new))
goto out_error;
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
if (!nfs_pageio_add_request(desc->pgio, new)) {
nfs_list_remove_request(new);
nfs_readpage_release(new);
error = desc->pgio->pg_error;
- goto out_unlock;
+ goto out;
}
return 0;
out_error:
error = PTR_ERR(new);
-out_unlock:
unlock_page(page);
+out:
return error;
}
@@ -424,8 +424,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
- npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
nfs_add_stats(inode, NFSIOS_READPAGES, npages);
read_complete:
put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1268280244e4..18d446e1a82bb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = {
enum {
Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
+ Opt_xprt_rdma6,
Opt_xprt_err
};
@@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = {
{ Opt_xprt_tcp, "tcp" },
{ Opt_xprt_tcp6, "tcp6" },
{ Opt_xprt_rdma, "rdma" },
+ { Opt_xprt_rdma6, "rdma6" },
{ Opt_xprt_err, NULL }
};
@@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
break;
+ case Opt_xprt_rdma6:
+ protofamily = AF_INET6;
case Opt_xprt_rdma:
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
@@ -1680,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
{
rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
unsigned int i;
+ int use_auth_null = false;
/*
* If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1687,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
*
* AUTH_NULL has a special meaning when it's in the server list - it
* means that the server will ignore the rpc creds, so any flavor
- * can be used.
+ * can be used but still use the sec= that was specified.
*/
for (i = 0; i < count; i++) {
flavor = server_authlist[i];
- if (nfs_auth_info_match(&args->auth_info, flavor) ||
- flavor == RPC_AUTH_NULL)
+ if (nfs_auth_info_match(&args->auth_info, flavor))
goto out;
+
+ if (flavor == RPC_AUTH_NULL)
+ use_auth_null = true;
+ }
+
+ if (use_auth_null) {
+ flavor = RPC_AUTH_NULL;
+ goto out;
}
dfprintk(MOUNT,
@@ -2408,6 +2420,11 @@ static int nfs_compare_super_address(struct nfs_server *server1,
struct nfs_server *server2)
{
struct sockaddr *sap1, *sap2;
+ struct rpc_xprt *xprt1 = server1->client->cl_xprt;
+ struct rpc_xprt *xprt2 = server2->client->cl_xprt;
+
+ if (!net_eq(xprt1->xprt_net, xprt2->xprt_net))
+ return 0;
sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index fa538b2ba2518..191aa577dd1f3 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -30,45 +30,11 @@
static void
nfs_free_unlinkdata(struct nfs_unlinkdata *data)
{
- iput(data->dir);
put_rpccred(data->cred);
kfree(data->args.name.name);
kfree(data);
}
-#define NAME_ALLOC_LEN(len) ((len+16) & ~15)
-/**
- * nfs_copy_dname - copy dentry name to data structure
- * @dentry: pointer to dentry
- * @data: nfs_unlinkdata
- */
-static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data)
-{
- char *str;
- int len = dentry->d_name.len;
-
- str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL);
- if (!str)
- return -ENOMEM;
- data->args.name.len = len;
- data->args.name.name = str;
- return 0;
-}
-
-static void nfs_free_dname(struct nfs_unlinkdata *data)
-{
- kfree(data->args.name.name);
- data->args.name.name = NULL;
- data->args.name.len = 0;
-}
-
-static void nfs_dec_sillycount(struct inode *dir)
-{
- struct nfs_inode *nfsi = NFS_I(dir);
- if (atomic_dec_return(&nfsi->silly_count) == 1)
- wake_up(&nfsi->waitqueue);
-}
-
/**
* nfs_async_unlink_done - Sillydelete post-processing
* @task: rpc_task of the sillydelete
@@ -78,7 +44,7 @@ static void nfs_dec_sillycount(struct inode *dir)
static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
{
struct nfs_unlinkdata *data = calldata;
- struct inode *dir = data->dir;
+ struct inode *dir = d_inode(data->dentry->d_parent);
trace_nfs_sillyrename_unlink(data, task->tk_status);
if (!NFS_PROTO(dir)->unlink_done(task, dir))
@@ -95,17 +61,21 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
static void nfs_async_unlink_release(void *calldata)
{
struct nfs_unlinkdata *data = calldata;
- struct super_block *sb = data->dir->i_sb;
+ struct dentry *dentry = data->dentry;
+ struct super_block *sb = dentry->d_sb;
- nfs_dec_sillycount(data->dir);
+ up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
+ d_lookup_done(dentry);
nfs_free_unlinkdata(data);
+ dput(dentry);
nfs_sb_deactive(sb);
}
static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_unlinkdata *data = calldata;
- NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);
+ struct inode *dir = d_inode(data->dentry->d_parent);
+ NFS_PROTO(dir)->unlink_rpc_prepare(task, data);
}
static const struct rpc_call_ops nfs_unlink_ops = {
@@ -114,7 +84,7 @@ static const struct rpc_call_ops nfs_unlink_ops = {
.rpc_call_prepare = nfs_unlink_prepare,
};
-static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
+static void nfs_do_call_unlink(struct nfs_unlinkdata *data)
{
struct rpc_message msg = {
.rpc_argp = &data->args,
@@ -129,10 +99,31 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
.flags = RPC_TASK_ASYNC,
};
struct rpc_task *task;
+ struct inode *dir = d_inode(data->dentry->d_parent);
+ nfs_sb_active(dir->i_sb);
+ data->args.fh = NFS_FH(dir);
+ nfs_fattr_init(data->res.dir_attr);
+
+ NFS_PROTO(dir)->unlink_setup(&msg, dir);
+
+ task_setup_data.rpc_client = NFS_CLIENT(dir);
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+}
+
+static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
struct dentry *alias;
- alias = d_lookup(parent, &data->args.name);
- if (alias != NULL) {
+ down_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
+ if (IS_ERR(alias)) {
+ up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ return 0;
+ }
+ if (!d_in_lookup(alias)) {
int ret;
void *devname_garbage = NULL;
@@ -140,10 +131,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
* Hey, we raced with lookup... See if we need to transfer
* the sillyrename information to the aliased dentry.
*/
- nfs_free_dname(data);
- ret = nfs_copy_dname(alias, data);
spin_lock(&alias->d_lock);
- if (ret == 0 && d_really_is_positive(alias) &&
+ if (d_really_is_positive(alias) &&
!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
devname_garbage = alias->d_fsdata;
alias->d_fsdata = data;
@@ -152,8 +141,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
} else
ret = 0;
spin_unlock(&alias->d_lock);
- nfs_dec_sillycount(dir);
dput(alias);
+ up_read_non_owner(&NFS_I(dir)->rmdir_sem);
/*
* If we'd displaced old cached devname, free it. At that
* point dentry is definitely not a root, so we won't need
@@ -162,94 +151,18 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
kfree(devname_garbage);
return ret;
}
- data->dir = igrab(dir);
- if (!data->dir) {
- nfs_dec_sillycount(dir);
- return 0;
- }
- nfs_sb_active(dir->i_sb);
- data->args.fh = NFS_FH(dir);
- nfs_fattr_init(data->res.dir_attr);
-
- NFS_PROTO(dir)->unlink_setup(&msg, dir);
-
- task_setup_data.rpc_client = NFS_CLIENT(dir);
- task = rpc_run_task(&task_setup_data);
- if (!IS_ERR(task))
- rpc_put_task_async(task);
+ data->dentry = alias;
+ nfs_do_call_unlink(data);
return 1;
}
-static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
-{
- struct dentry *parent;
- struct inode *dir;
- int ret = 0;
-
-
- parent = dget_parent(dentry);
- if (parent == NULL)
- goto out_free;
- dir = d_inode(parent);
- /* Non-exclusive lock protects against concurrent lookup() calls */
- spin_lock(&dir->i_lock);
- if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
- /* Deferred delete */
- hlist_add_head(&data->list, &NFS_I(dir)->silly_list);
- spin_unlock(&dir->i_lock);
- ret = 1;
- goto out_dput;
- }
- spin_unlock(&dir->i_lock);
- ret = nfs_do_call_unlink(parent, dir, data);
-out_dput:
- dput(parent);
-out_free:
- return ret;
-}
-
-void nfs_wait_on_sillyrename(struct dentry *dentry)
-{
- struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
-
- wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
-}
-
-void nfs_block_sillyrename(struct dentry *dentry)
-{
- struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
-
- wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1);
-}
-
-void nfs_unblock_sillyrename(struct dentry *dentry)
-{
- struct inode *dir = d_inode(dentry);
- struct nfs_inode *nfsi = NFS_I(dir);
- struct nfs_unlinkdata *data;
-
- atomic_inc(&nfsi->silly_count);
- spin_lock(&dir->i_lock);
- while (!hlist_empty(&nfsi->silly_list)) {
- if (!atomic_inc_not_zero(&nfsi->silly_count))
- break;
- data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list);
- hlist_del(&data->list);
- spin_unlock(&dir->i_lock);
- if (nfs_do_call_unlink(dentry, dir, data) == 0)
- nfs_free_unlinkdata(data);
- spin_lock(&dir->i_lock);
- }
- spin_unlock(&dir->i_lock);
-}
-
/**
* nfs_async_unlink - asynchronous unlinking of a file
* @dir: parent directory of dentry
* @dentry: dentry to unlink
*/
static int
-nfs_async_unlink(struct inode *dir, struct dentry *dentry)
+nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
{
struct nfs_unlinkdata *data;
int status = -ENOMEM;
@@ -258,13 +171,18 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
goto out;
+ data->args.name.name = kstrdup(name->name, GFP_KERNEL);
+ if (!data->args.name.name)
+ goto out_free;
+ data->args.name.len = name->len;
data->cred = rpc_lookup_cred();
if (IS_ERR(data->cred)) {
status = PTR_ERR(data->cred);
- goto out_free;
+ goto out_free_name;
}
data->res.dir_attr = &data->dir_attr;
+ init_waitqueue_head(&data->wq);
status = -EBUSY;
spin_lock(&dentry->d_lock);
@@ -284,6 +202,8 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
out_unlock:
spin_unlock(&dentry->d_lock);
put_rpccred(data->cred);
+out_free_name:
+ kfree(data->args.name.name);
out_free:
kfree(data);
out:
@@ -302,17 +222,15 @@ out:
void
nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
{
- struct nfs_unlinkdata *data = NULL;
+ struct nfs_unlinkdata *data;
spin_lock(&dentry->d_lock);
- if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
- dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
- data = dentry->d_fsdata;
- dentry->d_fsdata = NULL;
- }
+ dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+ data = dentry->d_fsdata;
+ dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
- if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
+ if (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))
nfs_free_unlinkdata(data);
}
@@ -559,18 +477,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
/* queue unlink first. Can't do this from rpc_release as it
* has to allocate memory
*/
- error = nfs_async_unlink(dir, dentry);
+ error = nfs_async_unlink(dentry, &sdentry->d_name);
if (error)
goto out_dput;
- /* populate unlinkdata with the right dname */
- error = nfs_copy_dname(sdentry,
- (struct nfs_unlinkdata *)dentry->d_fsdata);
- if (error) {
- nfs_cancel_async_unlink(dentry);
- goto out_dput;
- }
-
/* run the rename task, undo unlink if it fails */
task = nfs_async_rename(dir, dir, dentry, sdentry,
nfs_complete_sillyrename);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5754835a28860..3a6724c6eb5ff 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -150,7 +150,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
spin_lock(&inode->i_lock);
i_size = i_size_read(inode);
- end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (i_size - 1) >> PAGE_SHIFT;
if (i_size > 0 && page_file_index(page) < end_index)
goto out;
end = page_file_offset(page) + ((loff_t)offset+count);
@@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req)
static int wb_priority(struct writeback_control *wbc)
{
int ret = 0;
- if (wbc->for_reclaim)
- return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
+
if (wbc->sync_mode == WB_SYNC_ALL)
ret = FLUSH_COND_STABLE;
return ret;
@@ -626,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+ nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
@@ -658,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- unsigned long *bitlock = &NFS_I(inode)->flags;
struct nfs_pageio_descriptor pgio;
int err;
- /* Stop dirtying of new pages while we sync */
- err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
- nfs_wait_bit_killable, TASK_KILLABLE);
- if (err)
- goto out_err;
-
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@@ -675,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
- clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
- smp_mb__after_atomic();
- wake_up_bit(bitlock, NFS_INO_FLUSHING);
-
if (err < 0)
goto out_err;
err = pgio.pg_error;
@@ -737,7 +725,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
head = req->wb_head;
spin_lock(&inode->i_lock);
- if (likely(!PageSwapCache(head->wb_page))) {
+ if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
smp_mb__after_atomic();
@@ -759,7 +747,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
static void
nfs_mark_request_dirty(struct nfs_page *req)
{
- __set_page_dirty_nobuffers(req->wb_page);
+ if (req->wb_page)
+ __set_page_dirty_nobuffers(req->wb_page);
}
/*
@@ -804,7 +793,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
* number of outstanding requests requiring a commit as well as
* the MM page stats.
*
- * The caller must hold the cinfo->lock, and the nfs_page lock.
+ * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
*/
void
nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -832,10 +821,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
void
nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
- spin_unlock(cinfo->lock);
- nfs_mark_page_unstable(req->wb_page, cinfo);
+ spin_unlock(&cinfo->inode->i_lock);
+ if (req->wb_page)
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -864,7 +854,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode)
{
- cinfo->lock = &inode->i_lock;
+ cinfo->inode = inode;
cinfo->mds = &NFS_I(inode)->commit_info;
cinfo->ds = pnfs_get_ds_info(inode);
cinfo->dreq = NULL;
@@ -897,7 +887,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
static void
nfs_clear_page_commit(struct page *page)
{
- dec_zone_page_state(page, NR_UNSTABLE_NFS);
+ dec_node_page_state(page, NR_UNSTABLE_NFS);
dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
WB_RECLAIMABLE);
}
@@ -967,7 +957,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
return cinfo->mds->ncommit;
}
-/* cinfo->lock held by caller */
+/* cinfo->inode->i_lock held by caller */
int
nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_commit_info *cinfo, int max)
@@ -979,7 +969,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
if (!nfs_lock_request(req))
continue;
kref_get(&req->wb_kref);
- if (cond_resched_lock(cinfo->lock))
+ if (cond_resched_lock(&cinfo->inode->i_lock))
list_safe_reset_next(req, tmp, wb_list);
nfs_request_remove_commit_list(req, cinfo);
nfs_list_add_request(req, dst);
@@ -1005,7 +995,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
{
int ret = 0;
- spin_lock(cinfo->lock);
+ spin_lock(&cinfo->inode->i_lock);
if (cinfo->mds->ncommit > 0) {
const int max = INT_MAX;
@@ -1013,7 +1003,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
cinfo, max);
ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
}
- spin_unlock(cinfo->lock);
+ spin_unlock(&cinfo->inode->i_lock);
return ret;
}
@@ -1194,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
/*
* Test if the open context credential key is marked to expire soon.
*/
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
{
- return rpcauth_cred_key_to_expire(ctx->cred);
+ struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+
+ return rpcauth_cred_key_to_expire(auth, ctx->cred);
}
/*
@@ -1288,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
file, count, (long long)(page_file_offset(page) + offset));
+ if (!count)
+ goto out;
+
if (nfs_can_extend_write(file, page, inode)) {
count = max(count + offset, nfs_page_length(page));
offset = 0;
@@ -1298,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
nfs_set_pageerror(page);
else
__set_page_dirty_nobuffers(page);
-
+out:
dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
@@ -1709,6 +1704,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
{
struct nfs_commit_data *data;
+ /* another commit raced with us */
+ if (list_empty(head))
+ return 0;
+
data = nfs_commitdata_alloc();
if (!data)
@@ -1724,6 +1723,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
return -ENOMEM;
}
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
+{
+ struct inode *inode = file_inode(file);
+ struct nfs_open_context *open;
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ int ret;
+
+ open = get_nfs_open_context(nfs_file_open_context(file));
+ req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out_put;
+ }
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+
+ memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
+ nfs_request_add_commit_list(req, &cinfo);
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret > 0)
+ ret = 0;
+
+ nfs_free_request(req);
+out_put:
+ put_nfs_open_context(open);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_commit_file);
+
/*
* COMMIT call returned
*/
@@ -1748,7 +1777,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- nfs_clear_page_commit(req->wb_page);
+ if (req->wb_page)
+ nfs_clear_page_commit(req->wb_page);
dprintk("NFS: commit (%s/%llu %d@%lld)",
req->wb_context->dentry->d_sb->s_id,
@@ -1764,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */
- if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {
+ if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
/* We have a match */
nfs_inode_remove_request(req);
dprintk(" OK\n");
@@ -1888,6 +1918,24 @@ out_mark_dirty:
EXPORT_SYMBOL_GPL(nfs_write_inode);
/*
+ * Wrapper for filemap_write_and_wait_range()
+ *
+ * Needed for pNFS in order to ensure data becomes visible to the
+ * client.
+ */
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int ret;
+
+ ret = filemap_write_and_wait_range(mapping, lstart, lend);
+ if (ret == 0)
+ ret = pnfs_sync_inode(mapping->host, true);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
+
+/*
* flush the inode to disk.
*/
int nfs_wb_all(struct inode *inode)
@@ -1942,7 +1990,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
{
loff_t range_start = page_file_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+ loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index c9f583d7bac85..47febcf991850 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -90,6 +90,7 @@ config NFSD_BLOCKLAYOUT
bool "NFSv4.1 server support for pNFS block layouts"
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
+ select EXPORTFS_BLOCK_OPS
help
This option enables support for the exporting pNFS block layouts
in the kernel's NFS server. The pNFS block layout enables NFS
@@ -102,6 +103,7 @@ config NFSD_SCSILAYOUT
bool "NFSv4.1 server support for pNFS SCSI layouts"
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
+ select EXPORTFS_BLOCK_OPS
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
@@ -111,6 +113,23 @@ config NFSD_SCSILAYOUT
If unsure, say N.
+config NFSD_FLEXFILELAYOUT
+ bool "NFSv4.1 server support for pNFS Flex File layouts"
+ depends on NFSD_V4
+ select NFSD_PNFS
+ help
+ This option enables support for the exporting pNFS Flex File
+ layouts in the kernel's NFS server. The pNFS Flex File layout
+ enables NFS clients to directly perform I/O to NFSv3 devices
+ accessible to both the server and the clients. See
+ draft-ietf-nfsv4-flex-files for more details.
+
+ Warning, this server implements the bare minimum functionality
+ to be a flex file server - it is for testing the client,
+ not for use in production.
+
+ If unsure, say N.
+
config NFSD_V4_SECURITY_LABEL
bool "Provide Security Label support for NFSv4 server"
depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 3ae5f3c77e28b..5f5d3a76980c0 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -20,3 +20,4 @@ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e55b5242614da..5a17084415103 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
@@ -162,6 +163,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb,
static __be32
nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+ struct svc_rqst *rqstp,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
@@ -290,7 +292,7 @@ out_free_buf:
return error;
}
-#define NFSD_MDS_PR_KEY 0x0100000000000000
+#define NFSD_MDS_PR_KEY 0x0100000000000000ULL
/*
* We use the client ID as a unique key for the reservations.
@@ -354,6 +356,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
static __be32
nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+ struct svc_rqst *rqstp,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6c3b316f932e9..ac6f54546fdde 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
*/
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/nfs4.h>
#include "nfsd.h"
@@ -43,7 +44,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
switch (b->type) {
case PNFS_BLOCK_VOLUME_SIMPLE:
- len = 4 + 4 + 8 + 4 + b->simple.sig_len;
+ len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2);
p = xdr_reserve_space(xdr, len);
if (!p)
return -ETOOSMALL;
@@ -54,7 +55,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
break;
case PNFS_BLOCK_VOLUME_SCSI:
- len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
+ len = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) << 2) + 8;
p = xdr_reserve_space(xdr, len);
if (!p)
return -ETOOSMALL;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b4d84b579f20c..43e109cc0ccc3 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -706,7 +706,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
- new->ex_layout_type = 0;
+ new->ex_layout_types = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
}
@@ -731,7 +731,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
item->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = item->ex_fslocs.migrated;
item->ex_fslocs.migrated = 0;
- new->ex_layout_type = item->ex_layout_type;
+ new->ex_layout_types = item->ex_layout_types;
new->ex_nflavors = item->ex_nflavors;
for (i = 0; i < MAX_SECINFO_LIST; i++) {
new->ex_flavors[i] = item->ex_flavors[i];
@@ -954,6 +954,16 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
return 0;
}
+
+ /* If the compound op contains a spo_must_allowed op,
+ * it will be sent with integrity/protection which
+ * will have to be expressly allowed on mounts that
+ * don't support it
+ */
+
+ if (nfsd4_spo_must_allow(rqstp))
+ return 0;
+
return nfserr_wrongsec;
}
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 2e315072bf3fb..730f15eeb7ed5 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -57,7 +57,7 @@ struct svc_export {
struct nfsd4_fs_locations ex_fslocs;
uint32_t ex_nflavors;
struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
- enum pnfs_layouttype ex_layout_type;
+ u32 ex_layout_types;
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
};
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
new file mode 100644
index 0000000000000..df880e9fa71fb
--- /dev/null
+++ b/fs/nfsd/flexfilelayout.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ *
+ * The following implements a super-simple flex-file server
+ * where the NFSv4.1 mds is also the ds. And the storage is
+ * the same. I.e., writing to the mds via a NFSv4.1 WRITE
+ * goes to the same location as the NFSv3 WRITE.
+ */
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include <linux/sunrpc/addr.h>
+
+#include "flexfilelayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+static __be32
+nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *args)
+{
+ struct nfsd4_layout_seg *seg = &args->lg_seg;
+ u32 device_generation = 0;
+ int error;
+ uid_t u;
+
+ struct pnfs_ff_layout *fl;
+
+ /*
+ * The super simple flex file server has 1 mirror, 1 data server,
+ * and 1 file handle. So instead of 4 allocs, do 1 for now.
+ * Zero it out for the stateid - don't want junk in there!
+ */
+ error = -ENOMEM;
+ fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+ if (!fl)
+ goto out_error;
+ args->lg_content = fl;
+
+ /*
+ * Avoid layout commit, try to force the I/O to the DS,
+ * and for fun, cause all IOMODE_RW layout segments to
+ * effectively be WRITE only.
+ */
+ fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS |
+ FF_FLAGS_NO_READ_IO;
+
+ /* Do not allow an IOMODE_READ segment to have write permissions */
+ if (seg->iomode == IOMODE_READ) {
+ u = from_kuid(&init_user_ns, inode->i_uid) + 1;
+ fl->uid = make_kuid(&init_user_ns, u);
+ } else
+ fl->uid = inode->i_uid;
+ fl->gid = inode->i_gid;
+
+ error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation);
+ if (error)
+ goto out_error;
+
+ fl->fh.size = fhp->fh_handle.fh_size;
+ memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+
+ /* Give whole file layout segments */
+ seg->offset = 0;
+ seg->length = NFS4_MAX_UINT64;
+
+ dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length,
+ seg->iomode);
+ return 0;
+
+out_error:
+ seg->length = 0;
+ return nfserrno(error);
+}
+
+static __be32
+nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
+ struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_ff_device_addr *da;
+
+ u16 port;
+ char addr[INET6_ADDRSTRLEN];
+
+ da = kzalloc(sizeof(struct pnfs_ff_device_addr), GFP_KERNEL);
+ if (!da)
+ return nfserrno(-ENOMEM);
+
+ gdp->gd_device = da;
+
+ da->version = 3;
+ da->minor_version = 0;
+
+ da->rsize = svc_max_payload(rqstp);
+ da->wsize = da->rsize;
+
+ rpc_ntop((struct sockaddr *)&rqstp->rq_daddr,
+ addr, INET6_ADDRSTRLEN);
+ if (rqstp->rq_daddr.ss_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&rqstp->rq_daddr;
+ port = ntohs(sin->sin_port);
+ snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp");
+ da->netaddr.netid_len = 3;
+ } else {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr;
+ port = ntohs(sin6->sin6_port);
+ snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6");
+ da->netaddr.netid_len = 4;
+ }
+
+ da->netaddr.addr_len =
+ snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
+ "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+
+ da->tightly_coupled = false;
+
+ return 0;
+}
+
+const struct nfsd4_layout_ops ff_layout_ops = {
+ .notify_types =
+ NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+ .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_ff_proc_layoutget,
+ .encode_layoutget = nfsd4_ff_encode_layoutget,
+};
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
new file mode 100644
index 0000000000000..5e3fd7fc1a9fc
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "flexfilelayoutxdr.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+struct ff_idmap {
+ char buf[11];
+ int len;
+};
+
+__be32
+nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp)
+{
+ struct pnfs_ff_layout *fl = lgp->lg_content;
+ int len, mirror_len, ds_len, fh_len;
+ __be32 *p;
+
+ /*
+ * Unlike nfsd4_encode_user, we know these will
+ * always be stringified.
+ */
+ struct ff_idmap uid;
+ struct ff_idmap gid;
+
+ fh_len = 4 + fl->fh.size;
+
+ uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid));
+ gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid));
+
+ /* 8 + len for recording the length, name, and padding */
+ ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len +
+ 8 + uid.len + 8 + gid.len;
+
+ mirror_len = 4 + ds_len;
+
+ /* The layout segment */
+ len = 20 + mirror_len;
+
+ p = xdr_reserve_space(xdr, sizeof(__be32) + len);
+ if (!p)
+ return nfserr_toosmall;
+
+ *p++ = cpu_to_be32(len);
+ p = xdr_encode_hyper(p, 0); /* stripe unit of 1 */
+
+ *p++ = cpu_to_be32(1); /* single mirror */
+ *p++ = cpu_to_be32(1); /* single data server */
+
+ p = xdr_encode_opaque_fixed(p, &fl->deviceid,
+ sizeof(struct nfsd4_deviceid));
+
+ *p++ = cpu_to_be32(1); /* efficiency */
+
+ *p++ = cpu_to_be32(fl->stateid.si_generation);
+ p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque,
+ sizeof(stateid_opaque_t));
+
+ *p++ = cpu_to_be32(1); /* single file handle */
+ p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size);
+
+ p = xdr_encode_opaque(p, uid.buf, uid.len);
+ p = xdr_encode_opaque(p, gid.buf, gid.len);
+
+ *p++ = cpu_to_be32(fl->flags);
+ *p++ = cpu_to_be32(0); /* No stats collect hint */
+
+ return 0;
+}
+
+__be32
+nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_ff_device_addr *da = gdp->gd_device;
+ int len;
+ int ver_len;
+ int addr_len;
+ __be32 *p;
+
+ /* len + padding for two strings */
+ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len;
+ ver_len = 20;
+
+ len = 4 + ver_len + 4 + addr_len;
+
+ p = xdr_reserve_space(xdr, len + sizeof(__be32));
+ if (!p)
+ return nfserr_resource;
+
+ /*
+ * Fill in the overall length and number of volumes at the beginning
+ * of the layout.
+ */
+ *p++ = cpu_to_be32(len);
+ *p++ = cpu_to_be32(1); /* 1 netaddr */
+ p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len);
+ p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len);
+
+ *p++ = cpu_to_be32(1); /* 1 versions */
+
+ *p++ = cpu_to_be32(da->version);
+ *p++ = cpu_to_be32(da->minor_version);
+ *p++ = cpu_to_be32(da->rsize);
+ *p++ = cpu_to_be32(da->wsize);
+ *p++ = cpu_to_be32(da->tightly_coupled);
+
+ return 0;
+}
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
new file mode 100644
index 0000000000000..467defd4e5636
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#ifndef _NFSD_FLEXFILELAYOUTXDR_H
+#define _NFSD_FLEXFILELAYOUTXDR_H 1
+
+#include <linux/inet.h>
+#include "xdr4.h"
+
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
+
+struct xdr_stream;
+
+#define FF_NETID_LEN (4)
+#define FF_ADDR_LEN (INET6_ADDRSTRLEN + 8)
+struct pnfs_ff_netaddr {
+ char netid[FF_NETID_LEN + 1];
+ char addr[FF_ADDR_LEN + 1];
+ u32 netid_len;
+ u32 addr_len;
+};
+
+struct pnfs_ff_device_addr {
+ struct pnfs_ff_netaddr netaddr;
+ u32 version;
+ u32 minor_version;
+ u32 rsize;
+ u32 wsize;
+ bool tightly_coupled;
+};
+
+struct pnfs_ff_layout {
+ u32 flags;
+ u32 stats_collect_hint;
+ kuid_t uid;
+ kgid_t gid;
+ struct nfsd4_deviceid deviceid;
+ stateid_t stateid;
+ struct nfs_fh fh;
+};
+
+__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp);
+
+#endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 1580ea6fd64df..d08cd88155c75 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -104,22 +104,21 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
goto out;
inode = d_inode(fh->fh_dentry);
- if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
- error = -EOPNOTSUPP;
- goto out_errno;
- }
error = fh_want_write(fh);
if (error)
goto out_errno;
- error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+ fh_lock(fh);
+
+ error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
if (error)
- goto out_drop_write;
- error = inode->i_op->set_acl(inode, argp->acl_default,
- ACL_TYPE_DEFAULT);
+ goto out_drop_lock;
+ error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
if (error)
- goto out_drop_write;
+ goto out_drop_lock;
+
+ fh_unlock(fh);
fh_drop_write(fh);
@@ -131,7 +130,8 @@ out:
posix_acl_release(argp->acl_access);
posix_acl_release(argp->acl_default);
return nfserr;
-out_drop_write:
+out_drop_lock:
+ fh_unlock(fh);
fh_drop_write(fh);
out_errno:
nfserr = nfserrno(error);
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 01df4cd7c753f..0c890347cde3d 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -95,22 +95,20 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
goto out;
inode = d_inode(fh->fh_dentry);
- if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
- error = -EOPNOTSUPP;
- goto out_errno;
- }
error = fh_want_write(fh);
if (error)
goto out_errno;
- error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+ fh_lock(fh);
+
+ error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
if (error)
- goto out_drop_write;
- error = inode->i_op->set_acl(inode, argp->acl_default,
- ACL_TYPE_DEFAULT);
+ goto out_drop_lock;
+ error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
-out_drop_write:
+out_drop_lock:
+ fh_unlock(fh);
fh_drop_write(fh);
out_errno:
nfserr = nfserrno(error);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 51c3b06e80365..d818e4ffd79f9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -552,7 +552,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
* different read/write sizes for file systems known to have
* problems with large blocks */
if (nfserr == 0) {
- struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
+ struct super_block *sb = argp->fh.fh_dentry->d_sb;
/* Note that we don't care for remote fs's here */
if (sb->s_magic == MSDOS_SUPER_MAGIC) {
@@ -588,7 +588,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
if (nfserr == 0) {
- struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
+ struct super_block *sb = argp->fh.fh_dentry->d_sb;
/* Note that we don't care for remote fs's here */
switch (sb->s_magic) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2246454dec765..dba2ff8eaa68e 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -146,7 +146,7 @@ static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
default:
case FSIDSOURCE_DEV:
p = xdr_encode_hyper(p, (u64)huge_encode_dev
- (d_inode(fhp->fh_dentry)->i_sb->s_dev));
+ (fhp->fh_dentry->d_sb->s_dev));
break;
case FSIDSOURCE_FSID:
p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
@@ -379,7 +379,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
*/
hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- - hdr;
+ + rqstp->rq_arg.tail[0].iov_len - hdr;
/*
* Round the length of the data which was specified up to
* the next multiple of XDR units and then compare that
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 6adabd6049b71..71292a0d6f092 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -770,9 +770,6 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
inode = d_inode(dentry);
- if (!inode->i_op->set_acl || !IS_POSIXACL(inode))
- return nfserr_attrnotsupp;
-
if (S_ISDIR(inode->i_mode))
flags = NFS4_ACL_DIR;
@@ -782,16 +779,19 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_error < 0)
goto out_nfserr;
- host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS);
+ fh_lock(fhp);
+
+ host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl);
if (host_error < 0)
- goto out_release;
+ goto out_drop_lock;
if (S_ISDIR(inode->i_mode)) {
- host_error = inode->i_op->set_acl(inode, dpacl,
- ACL_TYPE_DEFAULT);
+ host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl);
}
-out_release:
+out_drop_lock:
+ fh_unlock(fhp);
+
posix_acl_release(pacl);
posix_acl_release(dpacl);
out_nfserr:
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7389cb1d7409c..04c68d9003249 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -710,22 +710,6 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc
}
}
-static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
-{
- struct rpc_xprt *xprt;
-
- if (args->protocol != XPRT_TRANSPORT_BC_TCP)
- return rpc_create(args);
-
- xprt = args->bc_xprt->xpt_bc_xprt;
- if (xprt) {
- xprt_get(xprt);
- return rpc_create_xprt(args, xprt);
- }
-
- return rpc_create(args);
-}
-
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
int maxtime = max_cb_time(clp->net);
@@ -768,7 +752,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.authflavor = ses->se_cb_sec.flavor;
}
/* Create RPC client */
- client = create_backchannel_client(&args);
+ client = rpc_create(&args);
if (IS_ERR(client)) {
dprintk("NFSD: couldn't create callback client: %ld\n",
PTR_ERR(client));
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 825c7bc8d7897..2be9602b0221b 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -27,6 +27,9 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+ [LAYOUT_FLEX_FILES] = &ff_layout_ops,
+#endif
#ifdef CONFIG_NFSD_BLOCKLAYOUT
[LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
#endif
@@ -122,28 +125,35 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
void nfsd4_setup_layout_type(struct svc_export *exp)
{
+#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+#endif
if (!(exp->ex_flags & NFSEXP_PNFS))
return;
/*
- * Check if the file system supports exporting a block-like layout.
+ * If flex file is configured, always advertise it. In addition,
+ * check if the file system supports exporting a block-like layout.
* If the block device supports reservations prefer the SCSI layout,
* otherwise advertise the block layout.
*/
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+ exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES; /* ex_layout_types is a bitmask: bit N stands for layout type N */
+#endif
#ifdef CONFIG_NFSD_BLOCKLAYOUT
+ /* additionally advertise the block layout; bits set above are kept */
if (sb->s_export_op->get_uuid &&
sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks)
- exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+ exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME; /* OR-ed in, so the flex files bit (if any) survives */
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
/* overwrite block layout selection if needed */
if (sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks &&
sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
- exp->ex_layout_type = LAYOUT_SCSI;
+ exp->ex_layout_types |= 1 << LAYOUT_SCSI; /* OR-ed in as well: this no longer replaces the block layout bit */
#endif
}
@@ -289,7 +299,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
status = nfserr_bad_stateid;
mutex_lock(&ls->ls_mutex);
- if (stateid->si_generation > stid->sc_stateid.si_generation)
+ if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid))
goto out_unlock_stid;
if (layout_type != ls->ls_layout_type)
goto out_unlock_stid;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index de1ff1d98bb18..1fb222752b2b1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -605,8 +605,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fh_init(&resfh, NFS4_FHSIZE);
- status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
- NFSD_MAY_CREATE);
+ status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP);
if (status)
return status;
@@ -1219,12 +1218,12 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static const struct nfsd4_layout_ops *
nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
{
- if (!exp->ex_layout_type) {
+ if (!exp->ex_layout_types) {
dprintk("%s: export does not support pNFS\n", __func__);
return NULL;
}
- if (exp->ex_layout_type != layout_type) {
+ if (!(exp->ex_layout_types & (1 << layout_type))) {
dprintk("%s: layout type %d not supported\n",
__func__, layout_type);
return NULL;
@@ -1270,7 +1269,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
nfserr = nfs_ok;
if (gdp->gd_maxcount != 0) {
nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
- cstate->session->se_client, gdp);
+ rqstp, cstate->session->se_client, gdp);
}
gdp->gd_notify_types &= ops->notify_types;
@@ -2335,6 +2334,45 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
};
+/**
+ * nfsd4_spo_must_allow - Determine if the compound op contains an
+ * operation that is allowed to be sent with machine credentials
+ *
+ * @rqstp: a pointer to the struct svc_rqst
+ *
+ * Scans the ops from index resp->opcnt to the end of the compound for one that
+ * is in the client's spo_must_allow map and was sent with matching machine creds.
+ */
+
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+ struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; /* NOTE(review): this initial value is never read; the loop reassigns it first */
+ struct nfsd4_compound_state *cstate = &resp->cstate;
+ struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; /* NOTE(review): derived from cstate->clp before the minorversion check - confirm clp is always set here */
+ u32 opiter;
+
+ if (!cstate->minorversion) /* minor version 0: never allowed */
+ return false;
+
+ if (cstate->spo_must_allowed == true) /* reuse a cached positive answer; a cached false still rescans below */
+ return true;
+
+ opiter = resp->opcnt; /* start scanning at index resp->opcnt */
+ while (opiter < argp->opcnt) {
+ this = &argp->ops[opiter++];
+ if (test_bit(this->opnum, allow->u.longs) && /* op number is set in the client's allow map */
+ cstate->clp->cl_mach_cred && /* client has the machine-cred flag set */
+ nfsd4_mach_creds_match(cstate->clp, rqstp)) { /* and this request's creds match the client */
+ cstate->spo_must_allowed = true; /* cache the positive result in the compound state */
+ return true;
+ }
+ }
+ cstate->spo_must_allowed = false; /* no remaining op qualified */
+ return false;
+}
+
int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
struct nfsd4_operation *opdesc;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0462eeddfff99..8410ca275db1a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1200,27 +1200,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist)
}
}
-static void release_lockowner(struct nfs4_lockowner *lo)
-{
- struct nfs4_client *clp = lo->lo_owner.so_client;
- struct nfs4_ol_stateid *stp;
- struct list_head reaplist;
-
- INIT_LIST_HEAD(&reaplist);
-
- spin_lock(&clp->cl_lock);
- unhash_lockowner_locked(lo);
- while (!list_empty(&lo->lo_owner.so_stateids)) {
- stp = list_first_entry(&lo->lo_owner.so_stateids,
- struct nfs4_ol_stateid, st_perstateowner);
- WARN_ON(!unhash_lock_stateid(stp));
- put_ol_stateid_locked(stp, &reaplist);
- }
- spin_unlock(&clp->cl_lock);
- free_ol_stateid_reaplist(&reaplist);
- nfs4_put_stateowner(&lo->lo_owner);
-}
-
static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
struct list_head *reaplist)
{
@@ -1972,7 +1951,7 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
service == RPC_GSS_SVC_PRIVACY;
}
-static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
{
struct svc_cred *cr = &rqstp->rq_cred;
@@ -2388,6 +2367,22 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
switch (exid->spa_how) {
case SP4_MACH_CRED:
+ exid->spo_must_enforce[0] = 0;
+ exid->spo_must_enforce[1] = (
+ 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32));
+
+ exid->spo_must_allow[0] &= (1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
+ 1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN));
+
+ exid->spo_must_allow[1] &= (
+ 1 << (OP_TEST_STATEID - 32) |
+ 1 << (OP_FREE_STATEID - 32));
if (!svc_rqst_integrity_protected(rqstp)) {
status = nfserr_inval;
goto out_nolock;
@@ -2424,7 +2419,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
status = nfserr_inval;
goto out;
}
- if (!mach_creds_match(conf, rqstp)) {
+ if (!nfsd4_mach_creds_match(conf, rqstp)) {
status = nfserr_wrong_cred;
goto out;
}
@@ -2473,6 +2468,8 @@ out_new:
goto out;
}
new->cl_minorversion = cstate->minorversion;
+ new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
+ new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1];
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -2676,7 +2673,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
if (conf) {
status = nfserr_wrong_cred;
- if (!mach_creds_match(conf, rqstp))
+ if (!nfsd4_mach_creds_match(conf, rqstp))
goto out_free_conn;
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
@@ -2692,7 +2689,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
status = nfserr_wrong_cred;
- if (!mach_creds_match(unconf, rqstp))
+ if (!nfsd4_mach_creds_match(unconf, rqstp))
goto out_free_conn;
cs_slot = &unconf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
@@ -2801,7 +2798,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
if (!session)
goto out_no_session;
status = nfserr_wrong_cred;
- if (!mach_creds_match(session->se_client, rqstp))
+ if (!nfsd4_mach_creds_match(session->se_client, rqstp))
goto out;
status = nfsd4_map_bcts_dir(&bcts->dir);
if (status)
@@ -2848,7 +2845,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
if (!ses)
goto out_client_lock;
status = nfserr_wrong_cred;
- if (!mach_creds_match(ses->se_client, r))
+ if (!nfsd4_mach_creds_match(ses->se_client, r))
goto out_put_session;
status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
if (status)
@@ -3087,7 +3084,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
status = nfserr_stale_clientid;
goto out;
}
- if (!mach_creds_match(clp, rqstp)) {
+ if (!nfsd4_mach_creds_match(clp, rqstp)) {
clp = NULL;
status = nfserr_wrong_cred;
goto out;
@@ -3112,7 +3109,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
* We don't take advantage of the rca_one_fs case.
* That's OK, it's optional, we can safely ignore it.
*/
- return nfs_ok;
+ return nfs_ok;
}
status = nfserr_complete_already;
@@ -3480,12 +3477,17 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
}
static struct nfs4_ol_stateid *
-init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
- struct nfsd4_open *open)
+init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
{
struct nfs4_openowner *oo = open->op_openowner;
struct nfs4_ol_stateid *retstp = NULL;
+ struct nfs4_ol_stateid *stp;
+
+ stp = open->op_stp;
+ /* We are moving these outside of the spinlocks to avoid the warnings */
+ mutex_init(&stp->st_mutex);
+ mutex_lock(&stp->st_mutex);
spin_lock(&oo->oo_owner.so_client->cl_lock);
spin_lock(&fp->fi_lock);
@@ -3493,6 +3495,8 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
retstp = nfsd4_find_existing_open(fp, open);
if (retstp)
goto out_unlock;
+
+ open->op_stp = NULL;
atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_locks);
@@ -3502,14 +3506,19 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
stp->st_openstp = NULL;
- init_rwsem(&stp->st_rwsem);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
- return retstp;
+ if (retstp) {
+ mutex_lock(&retstp->st_mutex);
+ /* To keep mutex tracking happy */
+ mutex_unlock(&stp->st_mutex);
+ stp = retstp;
+ }
+ return stp;
}
/*
@@ -4305,7 +4314,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct nfs4_ol_stateid *stp = NULL;
- struct nfs4_ol_stateid *swapstp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
@@ -4335,32 +4343,28 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
*/
if (stp) {
/* Stateid was found, this is an OPEN upgrade */
- down_read(&stp->st_rwsem);
+ mutex_lock(&stp->st_mutex);
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
if (status) {
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
goto out;
}
} else {
- stp = open->op_stp;
- open->op_stp = NULL;
- swapstp = init_open_stateid(stp, fp, open);
- if (swapstp) {
- nfs4_put_stid(&stp->st_stid);
- stp = swapstp;
- down_read(&stp->st_rwsem);
+ /* stp is returned locked. */
+ stp = init_open_stateid(fp, open);
+ /* See if we lost the race to some other thread */
+ if (stp->st_access_bmap != 0) {
status = nfs4_upgrade_open(rqstp, fp, current_fh,
stp, open);
if (status) {
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
goto out;
}
goto upgrade_out;
}
- down_read(&stp->st_rwsem);
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
if (status) {
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
release_open_stateid(stp);
goto out;
}
@@ -4372,7 +4376,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
}
upgrade_out:
nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
- up_read(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
if (nfsd4_has_session(&resp->cstate)) {
if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4651,12 +4655,6 @@ grace_disallows_io(struct net *net, struct inode *inode)
return opens_in_grace(net) && mandatory_lock(inode);
}
-/* Returns true iff a is later than b: */
-static bool stateid_generation_after(stateid_t *a, stateid_t *b)
-{
- return (s32)(a->si_generation - b->si_generation) > 0;
-}
-
static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
{
/*
@@ -4670,7 +4668,7 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s
return nfs_ok;
/* If the client sends us a stateid from the future, it's buggy: */
- if (stateid_generation_after(in, ref))
+ if (nfsd4_stateid_generation_after(in, ref))
return nfserr_bad_stateid;
/*
* However, we could see a stateid from the past, even from a
@@ -4983,12 +4981,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
* revoked delegations are kept only for free_stateid.
*/
return nfserr_bad_stateid;
- down_write(&stp->st_rwsem);
+ mutex_lock(&stp->st_mutex);
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
if (status == nfs_ok)
status = nfs4_check_fh(current_fh, &stp->st_stid);
if (status != nfs_ok)
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
return status;
}
@@ -5036,7 +5034,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
return status;
oo = openowner(stp->st_stateowner);
if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfs4_put_stid(&stp->st_stid);
return nfserr_bad_stateid;
}
@@ -5068,12 +5066,12 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
oo = openowner(stp->st_stateowner);
status = nfserr_bad_stateid;
if (oo->oo_flags & NFS4_OO_CONFIRMED) {
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
goto put_stateid;
}
oo->oo_flags |= NFS4_OO_CONFIRMED;
nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -5149,7 +5147,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
status = nfs_ok;
put_stateid:
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -5202,7 +5200,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfsd4_close_open_stateid(stp);
@@ -5428,7 +5426,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
- init_rwsem(&stp->st_rwsem);
+ mutex_init(&stp->st_mutex);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
spin_lock(&fp->fi_lock);
@@ -5597,7 +5595,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&open_stp, nn);
if (status)
goto out;
- up_write(&open_stp->st_rwsem);
+ mutex_unlock(&open_stp->st_mutex);
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5606,7 +5604,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
if (status == nfs_ok)
- down_write(&lock_stp->st_rwsem);
+ mutex_lock(&lock_stp->st_mutex);
} else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
@@ -5710,7 +5708,7 @@ out:
seqid_mutating_err(ntohl(status)))
lock_sop->lo_owner.so_seqid++;
- up_write(&lock_stp->st_rwsem);
+ mutex_unlock(&lock_stp->st_mutex);
/*
* If this is a new, never-before-used stateid, and we are
@@ -5880,7 +5878,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fput:
fput(filp);
put_stateid:
- up_write(&stp->st_rwsem);
+ mutex_unlock(&stp->st_mutex);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -5944,6 +5942,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct nfs4_client *clp;
+ LIST_HEAD (reaplist);
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
@@ -5974,9 +5973,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
nfs4_get_stateowner(sop);
break;
}
+ if (!lo) {
+ spin_unlock(&clp->cl_lock);
+ return status;
+ }
+
+ unhash_lockowner_locked(lo);
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid,
+ st_perstateowner);
+ WARN_ON(!unhash_lock_stateid(stp));
+ put_ol_stateid_locked(stp, &reaplist);
+ }
spin_unlock(&clp->cl_lock);
- if (lo)
- release_lockowner(lo);
+ free_ol_stateid_reaplist(&reaplist);
+ nfs4_put_stateowner(&lo->lo_owner);
+
return status;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9df898ba648f7..0aa0236a14290 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1299,16 +1299,14 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
break;
case SP4_MACH_CRED:
/* spo_must_enforce */
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy * 4);
- p += dummy;
-
+ status = nfsd4_decode_bitmap(argp,
+ exid->spo_must_enforce);
+ if (status)
+ goto out;
/* spo_must_allow */
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy * 4);
- p += dummy;
+ status = nfsd4_decode_bitmap(argp, exid->spo_must_allow);
+ if (status)
+ goto out;
break;
case SP4_SSV:
/* ssp_ops */
@@ -2164,22 +2162,20 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
}
static inline __be32
-nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type)
+nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) /* layout_types is a bitmask: bit N set means layout type N */
{
- __be32 *p;
+ __be32 *p;
+ unsigned long i = hweight_long(layout_types); /* i first holds the number of set bits, i.e. how many types are announced */
- if (layout_type) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1);
- *p++ = cpu_to_be32(layout_type);
- } else {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0);
- }
+ p = xdr_reserve_space(xdr, 4 + 4 * i); /* one word for the count plus one word per announced type */
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(i); /* array length */
+
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) /* i is reused here as the layout type number under test */
+ if (layout_types & (1 << i)) /* NOTE(review): bits outside this loop's range are counted above but never written - confirm none can be set */
+ *p++ = cpu_to_be32(i);
return 0;
}
@@ -2754,13 +2750,13 @@ out_acl:
}
#ifdef CONFIG_NFSD_PNFS
if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
if (status)
goto out;
}
if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
if (status)
goto out;
}
@@ -3867,14 +3863,6 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
return nfserr;
}
-static const u32 nfs4_minimal_spo_must_enforce[2] = {
- [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
- 1 << (OP_EXCHANGE_ID - 32) |
- 1 << (OP_CREATE_SESSION - 32) |
- 1 << (OP_DESTROY_SESSION - 32) |
- 1 << (OP_DESTROY_CLIENTID - 32)
-};
-
static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
@@ -3885,6 +3873,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
char *server_scope;
int major_id_sz;
int server_scope_sz;
+ int status = 0;
uint64_t minor_id = 0;
if (nfserr)
@@ -3913,18 +3902,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
case SP4_NONE:
break;
case SP4_MACH_CRED:
- /* spo_must_enforce, spo_must_allow */
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- return nfserr_resource;
-
/* spo_must_enforce bitmap: */
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]);
- *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]);
- /* empty spo_must_allow bitmap: */
- *p++ = cpu_to_be32(0);
-
+ status = nfsd4_encode_bitmap(xdr,
+ exid->spo_must_enforce[0],
+ exid->spo_must_enforce[1],
+ exid->spo_must_enforce[2]);
+ if (status)
+ goto out;
+ /* spo_must_allow bitmap: */
+ status = nfsd4_encode_bitmap(xdr,
+ exid->spo_must_allow[0],
+ exid->spo_must_allow[1],
+ exid->spo_must_allow[2]);
+ if (status)
+ goto out;
break;
default:
WARN_ON_ONCE(1);
@@ -3951,6 +3942,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
/* Implementation id */
*p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
return 0;
+out:
+ return status;
}
static __be32
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 9690cb4dd5887..65ad0165a94f8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -158,7 +158,6 @@ static const struct file_operations exports_proc_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
static int exports_nfsd_open(struct inode *inode, struct file *file)
@@ -171,7 +170,6 @@ static const struct file_operations exports_nfsd_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
static int export_features_show(struct seq_file *m, void *v)
@@ -217,7 +215,6 @@ static const struct file_operations pool_stats_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = nfsd_pool_stats_release,
- .owner = THIS_MODULE,
};
static struct file_operations reply_cache_stats_operations = {
@@ -1154,20 +1151,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
#endif
/* last one */ {""}
};
- struct net *net = data;
- int ret;
-
- ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
- if (ret)
- return ret;
- sb->s_fs_info = get_net(net);
- return 0;
+ get_net(sb->s_fs_info);
+ return simple_fill_super(sb, 0x6e667364, nfsd_files);
}
static struct dentry *nfsd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+ struct net *net = current->nsproxy->net_ns; /* network namespace of the calling task */
+ return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super); /* mount data, the net namespace and its user_ns are now all passed through */
}
static void nfsd_umount(struct super_block *sb)
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index cf980523898b7..9446849888d52 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -124,6 +124,7 @@ void nfs4_state_shutdown_net(struct net *net);
void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
#else
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
@@ -134,6 +135,10 @@ static inline void nfs4_state_shutdown_net(struct net *net) { }
static inline void nfs4_reset_lease(time_t leasetime) { }
static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
static inline char * nfs4_recoverydir(void) {return NULL; }
+static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) /* stub for the #else branch of the surrounding conditional */
+{
+ return false; /* never reports an allowed operation */
+}
#endif
/*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c1681ce894c5e..cfe7500d5847b 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,14 +59,20 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
* the write call).
*/
static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested)
+nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
+ umode_t requested)
{
- mode &= S_IFMT;
+ umode_t mode = d_inode(dentry)->i_mode & S_IFMT;
if (requested == 0) /* the caller doesn't care */
return nfs_ok;
- if (mode == requested)
+ if (mode == requested) {
+ if (mode == S_IFDIR && !d_can_lookup(dentry)) {
+ WARN_ON_ONCE(1);
+ return nfserr_notdir;
+ }
return nfs_ok;
+ }
/*
* v4 has an error more specific than err_notdir which we should
* return in preference to err_notdir:
@@ -298,7 +304,7 @@ out:
* that it expects something not of the given type.
*
* @access is formed from the NFSD_MAY_* constants defined in
- * include/linux/nfsd/nfsd.h.
+ * fs/nfsd/vfs.h.
*/
__be32
fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
@@ -340,7 +346,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
if (error)
goto out;
- error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type);
+ error = nfsd_mode_check(rqstp, dentry, type);
if (error)
goto out;
@@ -426,7 +432,7 @@ static bool is_root_export(struct svc_export *exp)
static struct super_block *exp_sb(struct svc_export *exp)
{
- return d_inode(exp->ex_path.dentry)->i_sb;
+ return exp->ex_path.dentry->d_sb;
}
static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
@@ -533,7 +539,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
* the reference filehandle (if it is in the same export)
* or the export options.
*/
- set_version_and_fsid_type(fhp, exp, ref_fh);
+ set_version_and_fsid_type(fhp, exp, ref_fh);
if (ref_fh == fhp)
fh_put(ref_fh);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 4cd78ef4c95c4..e9214768cde90 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -251,9 +251,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
/* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
- nfserr = nfserr_acces;
- if (!argp->len)
- goto done;
nfserr = nfserr_exist;
if (isdotent(argp->name, argp->len))
goto done;
@@ -362,8 +359,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
nfserr = 0;
if (!inode) {
/* File doesn't exist. Create it and set attrs */
- nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len,
- attr, type, rdev, newfhp);
+ nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name,
+ argp->len, attr, type, rdev, newfhp);
} else if (type == S_IFREG) {
dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
argp->name, attr->ia_valid, (long) attr->ia_size);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 79d964aa8079f..41b468a6a90f8 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -240,7 +240,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
|| !(p = decode_filename(p, &args->name, &args->len)))
return 0;
- return xdr_argsize_check(rqstp, p);
+ return xdr_argsize_check(rqstp, p);
}
int
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 7d073b9b15530..0c2a716e87411 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
u32 notify_types;
__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+ struct svc_rqst *rqstp,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
@@ -44,6 +45,9 @@ extern const struct nfsd4_layout_ops bl_layout_ops;
#ifdef CONFIG_NFSD_SCSILAYOUT
extern const struct nfsd4_layout_ops scsi_layout_ops;
#endif
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+extern const struct nfsd4_layout_ops ff_layout_ops;
+#endif
__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index c050c53036a62..b95adf9a15954 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -345,6 +345,7 @@ struct nfs4_client {
u32 cl_exchange_flags;
/* number of rpc's in progress over an associated session: */
atomic_t cl_refcount;
+ struct nfs4_op_map cl_spo_must_allow;
/* for nfs41 callbacks */
/* We currently support a single back channel with a single slot */
@@ -535,7 +536,7 @@ struct nfs4_ol_stateid {
unsigned char st_access_bmap;
unsigned char st_deny_bmap;
struct nfs4_ol_stateid *st_openstp;
- struct rw_semaphore st_rwsem;
+ struct mutex st_mutex;
};
static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
@@ -573,6 +574,11 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_SEQUENCE,
};
+/*
+ * Returns true iff stateid a's generation is later than b's.  The
+ * difference is taken in unsigned arithmetic and then viewed as signed,
+ * so the comparison stays correct across generation-counter wraparound.
+ */
+static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+	s32 delta = (s32)(a->si_generation - b->si_generation);
+
+	return delta > 0;
+}
struct nfsd4_compound_state;
struct nfsd_net;
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index cd90878a76aaa..d97338bb6a398 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -84,7 +84,6 @@ static int nfsd_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations nfsd_proc_fops = {
- .owner = THIS_MODULE,
.open = nfsd_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d40010e4f1a97..ba944123167b9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -935,8 +935,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
int stable = *stablep;
int use_wgather;
loff_t pos = offset;
- loff_t end = LLONG_MAX;
unsigned int pflags = current->flags;
+ int flags = 0;
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
/*
@@ -955,9 +955,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (!EX_ISSYNC(exp))
stable = 0;
+ if (stable && !use_wgather)
+ flags |= RWF_SYNC;
+
/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
+ host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, flags);
set_fs(oldfs);
if (host_err < 0)
goto out_nfserr;
@@ -965,15 +968,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
nfsdstats.io_write += host_err;
fsnotify_modify(file);
- if (stable) {
- if (use_wgather) {
- host_err = wait_for_concurrent_writes(file);
- } else {
- if (*cnt)
- end = offset + *cnt - 1;
- host_err = vfs_fsync_range(file, offset, end, 0);
- }
- }
+ if (stable && use_wgather)
+ host_err = wait_for_concurrent_writes(file);
out_nfserr:
dprintk("nfsd: write complete host_err=%d\n", host_err);
@@ -1139,96 +1135,37 @@ nfsd_check_ignore_resizing(struct iattr *iap)
iap->ia_valid &= ~ATTR_SIZE;
}
-/*
- * Create a file (regular, directory, device, fifo); UNIX sockets
- * not yet implemented.
- * If the response fh has been verified, the parent directory should
- * already be locked. Note that the parent directory is left locked.
- *
- * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
- */
+/* The parent directory should already be locked: */
__be32
-nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
int type, dev_t rdev, struct svc_fh *resfhp)
{
- struct dentry *dentry, *dchild = NULL;
+ struct dentry *dentry, *dchild;
struct inode *dirp;
__be32 err;
__be32 err2;
int host_err;
- err = nfserr_perm;
- if (!flen)
- goto out;
- err = nfserr_exist;
- if (isdotent(fname, flen))
- goto out;
-
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
- if (err)
- goto out;
-
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
- err = nfserr_notdir;
- if (!dirp->i_op->lookup)
- goto out;
- /*
- * Check whether the response file handle has been verified yet.
- * If it has, the parent directory should already be locked.
- */
- if (!resfhp->fh_dentry) {
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
-
- /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
- fh_lock_nested(fhp, I_MUTEX_PARENT);
- dchild = lookup_one_len(fname, dentry, flen);
- host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- goto out_nfserr;
- err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- if (err)
- goto out;
- } else {
- /* called from nfsd_proc_create */
- dchild = dget(resfhp->fh_dentry);
- if (!fhp->fh_locked) {
- /* not actually possible */
- printk(KERN_ERR
- "nfsd_create: parent %pd2 not locked!\n",
+ dchild = dget(resfhp->fh_dentry);
+ if (!fhp->fh_locked) {
+ WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
dentry);
- err = nfserr_io;
- goto out;
- }
- }
- /*
- * Make sure the child dentry is still negative ...
- */
- err = nfserr_exist;
- if (d_really_is_positive(dchild)) {
- dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
- dentry, dchild);
- goto out;
+ err = nfserr_io;
+ goto out;
}
+ err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+
if (!(iap->ia_valid & ATTR_MODE))
iap->ia_mode = 0;
iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
- err = nfserr_inval;
- if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
- printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
- type);
- goto out;
- }
-
- /*
- * Get the dir op function pointer.
- */
err = 0;
host_err = 0;
switch (type) {
@@ -1246,6 +1183,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
case S_IFSOCK:
host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
break;
+ default:
+ printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+ type);
+ host_err = -EINVAL;
}
if (host_err < 0)
goto out_nfserr;
@@ -1255,7 +1196,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
/*
* nfsd_create_setattr already committed the child. Transactional
* filesystems had a chance to commit changes for both parent and
- * child * simultaneously making the following commit_metadata a
+ * child simultaneously making the following commit_metadata a
* noop.
*/
err2 = nfserrno(commit_metadata(fhp));
@@ -1267,8 +1208,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!err)
err = fh_update(resfhp);
out:
- if (dchild && !IS_ERR(dchild))
- dput(dchild);
+ dput(dchild);
return err;
out_nfserr:
@@ -1276,6 +1216,50 @@ out_nfserr:
goto out;
}
+/*
+ * Create a filesystem object (regular, directory, special).
+ *
+ * Rejects "." and "..", verifies that @fhp is a directory, takes write
+ * access and the parent directory lock, looks up @fname under the parent,
+ * composes @resfhp for the child and then hands off to
+ * nfsd_create_locked() for the actual creation.
+ *
+ * Note that the parent directory is left locked.
+ *
+ * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
+ */
+__be32
+nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		char *fname, int flen, struct iattr *iap,
+		int type, dev_t rdev, struct svc_fh *resfhp)
+{
+	struct dentry	*dentry, *dchild;
+	__be32		err;
+	int		host_err;
+
+	if (isdotent(fname, flen))
+		return nfserr_exist;
+
+	/* The NFSD_MAY_CREATE check is done by nfsd_create_locked(). */
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP);
+	if (err)
+		return err;
+
+	dentry = fhp->fh_dentry;
+
+	host_err = fh_want_write(fhp);
+	if (host_err)
+		return nfserrno(host_err);
+
+	fh_lock_nested(fhp, I_MUTEX_PARENT);
+	dchild = lookup_one_len(fname, dentry, flen);
+	host_err = PTR_ERR(dchild);
+	if (IS_ERR(dchild))
+		return nfserrno(host_err);
+	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+	/*
+	 * Drop the lookup reference unconditionally: fh_compose() holds
+	 * its own reference to dchild in resfhp (released by fh_put()),
+	 * and nfsd_create_locked() takes and drops another one itself.
+	 * Keeping ours would leak a dentry reference on every successful
+	 * create.
+	 */
+	dput(dchild);
+	if (err)
+		return err;
+	return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
+					rdev, resfhp);
+}
+
#ifdef CONFIG_NFSD_V3
/*
@@ -1308,12 +1292,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
- /* Get all the sanity checks out of the way before
- * we lock the parent. */
- err = nfserr_notdir;
- if (!dirp->i_op->lookup)
- goto out;
-
host_err = fh_want_write(fhp);
if (host_err)
goto out_nfserr;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 2d573ec057f80..3cbb1b33777b5 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -59,6 +59,9 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
__be32 nfsd4_clone_file_range(struct file *, u64, struct file *,
u64, u64);
#endif /* CONFIG_NFSD_V4 */
+__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d9554813e58af..beea0c5edc514 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -59,6 +59,7 @@ struct nfsd4_compound_state {
struct nfsd4_session *session;
struct nfsd4_slot *slot;
int data_offset;
+ bool spo_must_allowed;
size_t iovlen;
u32 minorversion;
__be32 status;
@@ -403,6 +404,8 @@ struct nfsd4_exchange_id {
clientid_t clientid;
u32 seqid;
int spa_how;
+ u32 spo_must_enforce[3];
+ u32 spo_must_allow[3];
};
struct nfsd4_sequence {
@@ -654,6 +657,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
}
+
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
struct nfsd4_compoundargs *);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 2ccbf5531554b..2c90e285d7c67 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- * Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
*/
#include <linux/types.h>
@@ -58,7 +53,7 @@ nilfs_palloc_groups_count(const struct inode *inode)
* @inode: inode of metadata file using this allocator
* @entry_size: size of the persistent object
*/
-int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
+int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -73,13 +68,17 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
mi->mi_blocks_per_group =
DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
mi->mi_entries_per_block) + 1;
- /* Number of blocks in a group including entry blocks and
- a bitmap block */
+ /*
+ * Number of blocks in a group including entry blocks
+ * and a bitmap block
+ */
mi->mi_blocks_per_desc_block =
nilfs_palloc_groups_per_desc_block(inode) *
mi->mi_blocks_per_group + 1;
- /* Number of blocks per descriptor including the
- descriptor block */
+ /*
+ * Number of blocks per descriptor including the
+ * descriptor block
+ */
return 0;
}
@@ -389,7 +388,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned bsize,
+ unsigned int bsize,
spinlock_t *lock)
{
int pos, end = bsize;
@@ -623,10 +622,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
- (unsigned long long)req->pr_entry_nr,
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -664,10 +663,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
- (unsigned long long)req->pr_entry_nr,
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -740,8 +739,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
unsigned long group, group_offset;
__u64 group_min_nr, last_nrs[8];
const unsigned long epg = nilfs_palloc_entries_per_group(inode);
- const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
- unsigned entry_start, end, pos;
+ const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned int entry_start, end, pos;
spinlock_t *lock;
int i, j, k, ret;
u32 nfree;
@@ -773,10 +772,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
- (unsigned long long)entry_nrs[j],
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)entry_nrs[j]);
} else {
n++;
}
@@ -817,12 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
for (k = 0; k < nempties; k++) {
ret = nilfs_palloc_delete_entry_block(inode,
last_nrs[k]);
- if (ret && ret != -ENOENT) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to delete block of entry %llu: ino=%lu, err=%d\n",
- (unsigned long long)last_nrs[k],
- (unsigned long)inode->i_ino, ret);
- }
+ if (ret && ret != -ENOENT)
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
+ ret, (unsigned long long)last_nrs[k],
+ inode->i_ino);
}
desc_kaddr = kmap_atomic(desc_bh->b_page);
@@ -836,12 +834,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (nfree == nilfs_palloc_entries_per_group(inode)) {
ret = nilfs_palloc_delete_bitmap_block(inode, group);
- if (ret && ret != -ENOENT) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
- group,
- (unsigned long)inode->i_ino, ret);
- }
+ if (ret && ret != -ENOENT)
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "error %d deleting bitmap block of group=%lu, ino=%lu",
+ ret, group, inode->i_ino);
}
}
return 0;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 6e6f49aa53df3..05149e606a78a 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- * Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
*/
#ifndef _NILFS_ALLOC_H
@@ -42,7 +37,7 @@ nilfs_palloc_entries_per_group(const struct inode *inode)
return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */);
}
-int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
+int nilfs_palloc_init_blockgroup(struct inode *, unsigned int);
int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
struct buffer_head **);
void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 27f75bcbeb30d..01fb1831ca250 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/fs.h>
@@ -45,8 +41,8 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
struct inode *inode = bmap->b_inode;
if (err == -EINVAL) {
- nilfs_error(inode->i_sb, fname,
- "broken bmap (inode number=%lu)\n", inode->i_ino);
+ __nilfs_error(inode->i_sb, fname,
+ "broken bmap (inode number=%lu)", inode->i_ino);
err = -EIO;
}
return err;
@@ -97,7 +93,7 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
}
int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
- unsigned maxblocks)
+ unsigned int maxblocks)
{
int ret;
@@ -458,7 +454,7 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
struct buffer_head *pbh;
__u64 key;
- key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
+ key = page_index(bh->b_page) << (PAGE_SHIFT -
bmap->b_inode->i_blkbits);
for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
key++;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bfa817ce40b3a..2b6ffbe5997a2 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_BMAP_H
@@ -26,7 +22,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_binfo, nilfs_inode, etc */
#include "alloc.h"
#include "dat.h"
@@ -61,7 +57,7 @@ struct nilfs_bmap_stats {
struct nilfs_bmap_operations {
int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
- unsigned);
+ unsigned int);
int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
int (*bop_delete)(struct nilfs_bmap *, __u64);
void (*bop_clear)(struct nilfs_bmap *);
@@ -126,10 +122,14 @@ struct nilfs_bmap {
/* pointer type */
#define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. LBN) */
-#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single
- version) */
-#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple
- versions) */
+#define NILFS_BMAP_PTR_VS 1 /*
+ * virtual block number (single
+ * version)
+ */
+#define NILFS_BMAP_PTR_VM 2 /*
+ * virtual block number (has multiple
+ * versions)
+ */
#define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */
#define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0)
@@ -154,7 +154,7 @@ struct nilfs_bmap_store {
int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int);
int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index a35ae35e69320..d5c23da43513c 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * This file was originally written by Seiji Kihara <kihara@osrg.net>
- * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
- * stabilization and simplification.
+ * Originally written by Seiji Kihara.
+ * Fully revised by Ryusuke Konishi for stabilization and simplification.
*
*/
@@ -46,7 +41,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
struct inode *inode = NILFS_BTNC_I(btnc);
struct buffer_head *bh;
- bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
if (unlikely(!bh))
return NULL;
@@ -62,12 +57,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
set_buffer_uptodate(bh);
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
return bh;
}
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
- sector_t pblocknr, int mode,
+ sector_t pblocknr, int mode, int mode_flags,
struct buffer_head **pbh, sector_t *submit_ptr)
{
struct buffer_head *bh;
@@ -75,7 +70,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
struct page *page;
int err;
- bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
if (unlikely(!bh))
return -ENOMEM;
@@ -100,7 +95,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
}
}
- if (mode == READA) {
+ if (mode_flags & REQ_RAHEAD) {
if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
err = -EBUSY; /* internal code */
brelse(bh);
@@ -119,7 +114,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
bh->b_blocknr = pblocknr; /* set block address for read */
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(mode, bh);
+ submit_bh(mode, mode_flags, bh);
bh->b_blocknr = blocknr; /* set back to the given block address */
*submit_ptr = pblocknr;
err = 0;
@@ -128,7 +123,7 @@ found:
out_locked:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -146,7 +141,7 @@ void nilfs_btnode_delete(struct buffer_head *bh)
pgoff_t index = page_index(page);
int still_dirty;
- page_cache_get(page);
+ get_page(page);
lock_page(page);
wait_on_page_writeback(page);
@@ -154,7 +149,7 @@ void nilfs_btnode_delete(struct buffer_head *bh)
still_dirty = PageDirty(page);
mapping = page->mapping;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (!still_dirty && mapping)
invalidate_inode_pages2_range(mapping, index, index);
@@ -181,7 +176,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
obh = ctxt->bh;
ctxt->newbh = NULL;
- if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
+ if (inode->i_blkbits == PAGE_SHIFT) {
lock_page(obh->b_page);
/*
* We cannot call radix_tree_preload for the kernels older
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index d876b565ce648..4e8aaa1aeb65d 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Seiji Kihara.
+ * Revised by Ryusuke Konishi.
*/
#ifndef _NILFS_BTNODE_H
@@ -47,7 +43,7 @@ void nilfs_btnode_cache_clear(struct address_space *);
struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
__u64 blocknr);
int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
- struct buffer_head **, sector_t *);
+ int, struct buffer_head **, sector_t *);
void nilfs_btnode_delete(struct buffer_head *);
int nilfs_btnode_prepare_change_key(struct address_space *,
struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 3a3821b00486b..2e315f9f2e51d 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/slab.h>
@@ -343,12 +339,14 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
* nilfs_btree_node_broken - verify consistency of btree node
* @node: btree node block to be examined
* @size: node size (in bytes)
+ * @inode: host inode of btree
* @blocknr: block number
*
* Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
*/
static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
- size_t size, sector_t blocknr)
+ size_t size, struct inode *inode,
+ sector_t blocknr)
{
int level, flags, nchildren;
int ret = 0;
@@ -362,9 +360,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
(flags & NILFS_BTREE_NODE_ROOT) ||
nchildren < 0 ||
nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
- printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
- "level = %d, flags = 0x%x, nchildren = %d\n",
- (unsigned long long)blocknr, level, flags, nchildren);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, (unsigned long long)blocknr, level,
+ flags, nchildren);
ret = 1;
}
return ret;
@@ -373,12 +372,12 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
/**
* nilfs_btree_root_broken - verify consistency of btree root node
* @node: btree root node to be examined
- * @ino: inode number
+ * @inode: host inode of btree
*
* Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
*/
static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
- unsigned long ino)
+ struct inode *inode)
{
int level, flags, nchildren;
int ret = 0;
@@ -391,8 +390,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
- pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
- ino, level, flags, nchildren);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, level, flags, nchildren);
ret = 1;
}
return ret;
@@ -400,13 +400,15 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
+ struct inode *inode;
int ret;
if (buffer_nilfs_checked(bh))
return 0;
+ inode = bh->b_page->mapping->host;
ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
- bh->b_size, bh->b_blocknr);
+ bh->b_size, inode, bh->b_blocknr);
if (likely(!ret))
set_buffer_nilfs_checked(bh);
return ret;
@@ -452,13 +454,15 @@ nilfs_btree_get_node(const struct nilfs_bmap *btree,
return node;
}
-static int
-nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+static int nilfs_btree_bad_node(const struct nilfs_bmap *btree,
+ struct nilfs_btree_node *node, int level)
{
if (unlikely(nilfs_btree_node_get_level(node) != level)) {
dump_stack();
- printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
- nilfs_btree_node_get_level(node), level);
+ nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
+ "btree level mismatch (ino=%lu): %d != %d",
+ btree->b_inode->i_ino,
+ nilfs_btree_node_get_level(node), level);
return 1;
}
return 0;
@@ -480,7 +484,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
sector_t submit_ptr = 0;
int ret;
- ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
+ ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh,
+ &submit_ptr);
if (ret) {
if (ret != -EEXIST)
return ret;
@@ -496,7 +501,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
n > 0 && i < ra->ncmax; n--, i++) {
ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
- ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
+ ret = nilfs_btnode_submit_block(btnc, ptr2, 0,
+ REQ_OP_READ, REQ_RAHEAD,
&ra_bh, &submit_ptr);
if (likely(!ret || ret == -EEXIST))
brelse(ra_bh);
@@ -511,6 +517,9 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
out_no_wait:
if (!buffer_uptodate(bh)) {
+ nilfs_msg(btree->b_inode->i_sb, KERN_ERR,
+ "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)",
+ btree->b_inode->i_ino, (unsigned long long)ptr);
brelse(bh);
return -EIO;
}
@@ -570,7 +579,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
return ret;
node = nilfs_btree_get_nonroot_node(path, level);
- if (nilfs_btree_bad_node(node, level))
+ if (nilfs_btree_bad_node(btree, node, level))
return -EINVAL;
if (!found)
found = nilfs_btree_node_lookup(node, key, &index);
@@ -618,7 +627,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
if (ret < 0)
return ret;
node = nilfs_btree_get_nonroot_node(path, level);
- if (nilfs_btree_bad_node(node, level))
+ if (nilfs_btree_bad_node(btree, node, level))
return -EINVAL;
index = nilfs_btree_node_get_nchildren(node) - 1;
ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
@@ -689,7 +698,8 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
}
static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
- __u64 key, __u64 *ptrp, unsigned maxblocks)
+ __u64 key, __u64 *ptrp,
+ unsigned int maxblocks)
{
struct nilfs_btree_path *path;
struct nilfs_btree_node *node;
@@ -1032,12 +1042,12 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
if (ptr != NILFS_BMAP_INVALID_PTR)
/* sequential access */
return ptr;
- else {
- ptr = nilfs_btree_find_near(btree, path);
- if (ptr != NILFS_BMAP_INVALID_PTR)
- /* near */
- return ptr;
- }
+
+ ptr = nilfs_btree_find_near(btree, path);
+ if (ptr != NILFS_BMAP_INVALID_PTR)
+ /* near */
+ return ptr;
+
/* block group */
return nilfs_bmap_find_target_in_group(btree);
}
@@ -2073,8 +2083,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
if (unlikely(ret == -ENOENT))
- printk(KERN_CRIT "%s: key = %llu, level == %d\n",
- __func__, (unsigned long long)key, level);
+ nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
+ "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
+ btree->b_inode->i_ino,
+ (unsigned long long)key, level);
goto out;
}
@@ -2111,12 +2123,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX) {
dump_stack();
- printk(KERN_WARNING
- "%s: invalid btree level: %d (key=%llu, ino=%lu, "
- "blocknr=%llu)\n",
- __func__, level, (unsigned long long)key,
- NILFS_BMAP_I(btree)->vfs_inode.i_ino,
- (unsigned long long)bh->b_blocknr);
+ nilfs_msg(btree->b_inode->i_sb, KERN_WARNING,
+ "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
+ level, (unsigned long long)key,
+ btree->b_inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
return;
}
@@ -2395,8 +2406,7 @@ int nilfs_btree_init(struct nilfs_bmap *bmap)
__nilfs_btree_init(bmap);
- if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
- bmap->b_inode->i_ino))
+ if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode))
ret = -EIO;
return ret;
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 22c02e35b6ef4..2184e47fa4bf6 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_BTREE_H
@@ -26,7 +22,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/list.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_btree_node */
#include "btnode.h"
#include "bmap.h"
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index b6596cab9e99e..a15a1601e931d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/kernel.h>
@@ -25,7 +21,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/errno.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "cpfile.h"
@@ -41,6 +36,7 @@ static unsigned long
nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+
do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
return (unsigned long)tcno;
}
@@ -50,6 +46,7 @@ static unsigned long
nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
}
@@ -334,9 +331,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
int ret, ncps, nicps, nss, count, i;
if (unlikely(start == 0 || start > end)) {
- printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
- "[%llu, %llu)\n", __func__,
- (unsigned long long)start, (unsigned long long)end);
+ nilfs_msg(cpfile->i_sb, KERN_ERR,
+ "cannot delete checkpoints: invalid range [%llu, %llu)",
+ (unsigned long long)start, (unsigned long long)end);
return -EINVAL;
}
@@ -388,9 +385,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cno);
if (ret == 0)
continue;
- printk(KERN_ERR
- "%s: cannot delete block\n",
- __func__);
+ nilfs_msg(cpfile->i_sb, KERN_ERR,
+ "error %d deleting checkpoint block",
+ ret);
break;
}
}
@@ -433,7 +430,8 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
}
static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz,
+ size_t nci)
{
struct nilfs_checkpoint *cp;
struct nilfs_cpinfo *ci = buf;
@@ -484,7 +482,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
}
static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz,
+ size_t nci)
{
struct buffer_head *bh;
struct nilfs_cpfile_header *header;
@@ -570,7 +569,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
*/
ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz, size_t nci)
{
switch (mode) {
case NILFS_CHECKPOINT:
@@ -870,8 +869,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
void *kaddr;
int ret;
- /* CP number is invalid if it's zero or larger than the
- largest exist one.*/
+ /*
+ * CP number is invalid if it's zero or larger than the
+ * largest existing one.
+ */
if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
return -ENOENT;
down_read(&NILFS_MDT(cpfile)->mi_sem);
@@ -989,14 +990,12 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
int err;
if (cpsize > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large checkpoint size: %zu bytes.\n",
- cpsize);
+ nilfs_msg(sb, KERN_ERR,
+ "too large checkpoint size: %zu bytes", cpsize);
return -EINVAL;
} else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
- printk(KERN_ERR
- "NILFS: too small checkpoint size: %zu bytes.\n",
- cpsize);
+ nilfs_msg(sb, KERN_ERR,
+ "too small checkpoint size: %zu bytes", cpsize);
return -EINVAL;
}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index a242b9a314f95..6eca972f9673c 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_CPFILE_H
@@ -25,7 +21,8 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_api.h> /* nilfs_cpstat */
+#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
@@ -37,8 +34,8 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
int nilfs_cpfile_is_snapshot(struct inode *, __u64);
int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
- size_t);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *,
+ unsigned int, size_t);
int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7dc23f100e579..dffedb2f88179 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/types.h>
@@ -353,10 +349,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
- printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
- (unsigned long long)vblocknr,
- (unsigned long long)le64_to_cpu(entry->de_start),
- (unsigned long long)le64_to_cpu(entry->de_end));
+ nilfs_msg(dat->i_sb, KERN_CRIT,
+ "%s: invalid vblocknr = %llu, [%llu, %llu)",
+ __func__, (unsigned long long)vblocknr,
+ (unsigned long long)le64_to_cpu(entry->de_start),
+ (unsigned long long)le64_to_cpu(entry->de_end));
kunmap_atomic(kaddr);
brelse(entry_bh);
return -EINVAL;
@@ -428,7 +425,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
return ret;
}
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
size_t nvi)
{
struct buffer_head *entry_bh;
@@ -483,14 +480,12 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
int err;
if (entry_size > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large DAT entry size: %zu bytes.\n",
- entry_size);
+ nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes",
+ entry_size);
return -EINVAL;
} else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
- printk(KERN_ERR
- "NILFS: too small DAT entry size: %zu bytes.\n",
- entry_size);
+ nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes",
+ entry_size);
return -EINVAL;
}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index cbd8e97325030..57dc6cf466d02 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_DAT_H
@@ -26,6 +22,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
struct nilfs_palloc_req;
@@ -51,7 +48,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
int nilfs_dat_mark_dirty(struct inode *, __u64);
int nilfs_dat_freev(struct inode *, __u64 *, size_t);
int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned int, size_t);
int nilfs_dat_read(struct super_block *sb, size_t entry_size,
struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 6b8b92b19cec9..908ebbf0ac7ea 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji.
*/
/*
* linux/fs/ext2/dir.c
@@ -46,11 +42,33 @@
#include "nilfs.h"
#include "page.h"
+static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen)
+{
+ unsigned int len = le16_to_cpu(dlen);
+
+#if (PAGE_SIZE >= 65536)
+ if (len == NILFS_MAX_REC_LEN)
+ return 1 << 16;
+#endif
+ return len;
+}
+
+static inline __le16 nilfs_rec_len_to_disk(unsigned int len)
+{
+#if (PAGE_SIZE >= 65536)
+ if (len == (1 << 16))
+ return cpu_to_le16(NILFS_MAX_REC_LEN);
+
+ BUG_ON(len > (1 << 16));
+#endif
+ return cpu_to_le16(len);
+}
+
/*
* nilfs uses block-sized chunks. Arguably, sector-sized ones would be
* more robust, but we have what we have
*/
-static inline unsigned nilfs_chunk_size(struct inode *inode)
+static inline unsigned int nilfs_chunk_size(struct inode *inode)
{
return inode->i_sb->s_blocksize;
}
@@ -58,37 +76,39 @@ static inline unsigned nilfs_chunk_size(struct inode *inode)
static inline void nilfs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
*/
-static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
+static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr)
{
- unsigned last_byte = inode->i_size;
+ unsigned int last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
-static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
+static int nilfs_prepare_chunk(struct page *page, unsigned int from,
+ unsigned int to)
{
loff_t pos = page_offset(page) + from;
+
return __block_write_begin(page, pos, to - from, nilfs_get_block);
}
static void nilfs_commit_chunk(struct page *page,
struct address_space *mapping,
- unsigned from, unsigned to)
+ unsigned int from, unsigned int to)
{
struct inode *dir = mapping->host;
loff_t pos = page_offset(page) + from;
- unsigned len = to - from;
- unsigned nr_dirty, copied;
+ unsigned int len = to - from;
+ unsigned int nr_dirty, copied;
int err;
nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
@@ -102,19 +122,19 @@ static void nilfs_commit_chunk(struct page *page,
unlock_page(page);
}
-static void nilfs_check_page(struct page *page)
+static bool nilfs_check_page(struct page *page)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
- unsigned chunk_size = nilfs_chunk_size(dir);
+ unsigned int chunk_size = nilfs_chunk_size(dir);
char *kaddr = page_address(page);
- unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned int offs, rec_len;
+ unsigned int limit = PAGE_SIZE;
struct nilfs_dir_entry *p;
char *error;
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -137,15 +157,14 @@ static void nilfs_check_page(struct page *page)
goto Eend;
out:
SetPageChecked(page);
- return;
+ return true;
/* Too bad, we had an error */
Ebadsize:
- nilfs_error(sb, "nilfs_check_page",
+ nilfs_error(sb,
"size of directory #%lu is not a multiple of chunk size",
- dir->i_ino
- );
+ dir->i_ino);
goto fail;
Eshort:
error = "rec_len is smaller than minimal";
@@ -159,22 +178,21 @@ Enamelen:
Espan:
error = "directory entry across blocks";
bad_entry:
- nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
- (unsigned long) le64_to_cpu(p->inode),
+ nilfs_error(sb,
+ "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, (page->index << PAGE_SHIFT) + offs,
+ (unsigned long)le64_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
Eend:
p = (struct nilfs_dir_entry *)(kaddr + offs);
- nilfs_error(sb, "nilfs_check_page",
- "entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
- (unsigned long) le64_to_cpu(p->inode));
+ nilfs_error(sb,
+ "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu",
+ dir->i_ino, (page->index << PAGE_SHIFT) + offs,
+ (unsigned long)le64_to_cpu(p->inode));
fail:
- SetPageChecked(page);
SetPageError(page);
+ return false;
}
static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
@@ -184,10 +202,10 @@ static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
if (!IS_ERR(page)) {
kmap(page);
- if (!PageChecked(page))
- nilfs_check_page(page);
- if (PageError(page))
- goto fail;
+ if (unlikely(!PageChecked(page))) {
+ if (PageError(page) || !nilfs_check_page(page))
+ goto fail;
+ }
}
return page;
@@ -256,10 +274,9 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
-/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
return 0;
@@ -270,9 +287,8 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
struct page *page = nilfs_get_page(inode, n);
if (IS_ERR(page)) {
- nilfs_error(sb, __func__, "bad page in #%lu",
- inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ nilfs_error(sb, "bad page in #%lu", inode->i_ino);
+ ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
kaddr = page_address(page);
@@ -281,8 +297,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
NILFS_DIR_REC_LEN(1);
for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
if (de->rec_len == 0) {
- nilfs_error(sb, __func__,
- "zero-length directory entry");
+ nilfs_error(sb, "zero-length directory entry");
nilfs_put_page(page);
return -EIO;
}
@@ -321,7 +336,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
{
const unsigned char *name = qstr->name;
int namelen = qstr->len;
- unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+ unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
struct page *page = NULL;
@@ -340,6 +355,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
n = start;
do {
char *kaddr;
+
page = nilfs_get_page(dir, n);
if (!IS_ERR(page)) {
kaddr = page_address(page);
@@ -347,7 +363,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
kaddr += nilfs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
if (de->rec_len == 0) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"zero-length directory entry");
nilfs_put_page(page);
goto out;
@@ -361,8 +377,8 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
if (++n >= npages)
n = 0;
/* next page is past the blocks we've got */
- if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
- nilfs_error(dir->i_sb, __func__,
+ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
+ nilfs_error(dir->i_sb,
"dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
(unsigned long long)dir->i_blocks);
@@ -401,7 +417,7 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
if (de) {
res = le64_to_cpu(de->inode);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
return res;
}
@@ -410,8 +426,8 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
struct page *page, struct inode *inode)
{
- unsigned from = (char *) de - (char *) page_address(page);
- unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
+ unsigned int from = (char *)de - (char *)page_address(page);
+ unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len);
struct address_space *mapping = page->mapping;
int err;
@@ -433,15 +449,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
struct inode *dir = d_inode(dentry->d_parent);
const unsigned char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned chunk_size = nilfs_chunk_size(dir);
- unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+ unsigned int chunk_size = nilfs_chunk_size(dir);
+ unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
struct page *page = NULL;
struct nilfs_dir_entry *de;
unsigned long npages = dir_pages(dir);
unsigned long n;
char *kaddr;
- unsigned from, to;
+ unsigned int from, to;
int err;
/*
@@ -460,7 +476,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + nilfs_last_byte(dir, n);
de = (struct nilfs_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -471,7 +487,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
goto got_it;
}
if (de->rec_len == 0) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"zero-length directory entry");
err = -EIO;
goto out_unlock;
@@ -533,16 +549,17 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
char *kaddr = page_address(page);
- unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
- unsigned to = ((char *)dir - kaddr) +
- nilfs_rec_len_from_disk(dir->rec_len);
- struct nilfs_dir_entry *pde = NULL;
- struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
+ unsigned int from, to;
+ struct nilfs_dir_entry *de, *pde = NULL;
int err;
+ from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
+ to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len);
+ de = (struct nilfs_dir_entry *)(kaddr + from);
+
while ((char *)de < (char *)dir) {
if (de->rec_len == 0) {
- nilfs_error(inode->i_sb, __func__,
+ nilfs_error(inode->i_sb,
"zero-length directory entry");
err = -EIO;
goto out;
@@ -572,7 +589,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
{
struct address_space *mapping = inode->i_mapping;
struct page *page = grab_cache_page(mapping, 0);
- unsigned chunk_size = nilfs_chunk_size(inode);
+ unsigned int chunk_size = nilfs_chunk_size(inode);
struct nilfs_dir_entry *de;
int err;
void *kaddr;
@@ -603,7 +620,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
kunmap_atomic(kaddr);
nilfs_commit_chunk(page, mapping, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -629,9 +646,9 @@ int nilfs_empty_dir(struct inode *inode)
while ((char *)de <= kaddr) {
if (de->rec_len == 0) {
- nilfs_error(inode->i_sb, __func__,
- "zero-length directory entry "
- "(kaddr=%p, de=%p)\n", kaddr, de);
+ nilfs_error(inode->i_sb,
+ "zero-length directory entry (kaddr=%p, de=%p)",
+ kaddr, de);
goto not_empty;
}
if (de->inode != 0) {
@@ -661,7 +678,7 @@ not_empty:
const struct file_operations nilfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = nilfs_readdir,
+ .iterate_shared = nilfs_readdir,
.unlocked_ioctl = nilfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = nilfs_compat_ioctl,
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index ebf89fd8ac1a1..96e3ed0d9652b 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/errno.h>
@@ -62,7 +58,7 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
__u64 key, __u64 *ptrp,
- unsigned maxblocks)
+ unsigned int maxblocks)
{
struct inode *dat = NULL;
__u64 ptr, ptr2;
@@ -83,7 +79,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
ptr = blocknr;
}
- maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+ maxblocks = min_t(unsigned int, maxblocks,
+ NILFS_DIRECT_KEY_MAX - key + 1);
for (cnt = 1; cnt < maxblocks &&
(ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
NILFS_BMAP_INVALID_PTR;
@@ -110,9 +107,9 @@ nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
if (ptr != NILFS_BMAP_INVALID_PTR)
/* sequential access */
return ptr;
- else
- /* block group */
- return nilfs_bmap_find_target_in_group(direct);
+
+ /* block group */
+ return nilfs_bmap_find_target_in_group(direct);
}
static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -340,14 +337,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
key = nilfs_bmap_data_get_key(bmap, *bh);
if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
- printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
- (unsigned long long)key);
+ nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
+ "%s (ino=%lu): invalid key: %llu", __func__,
+ bmap->b_inode->i_ino, (unsigned long long)key);
return -EINVAL;
}
ptr = nilfs_direct_get_ptr(bmap, key);
if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
- printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
- (unsigned long long)ptr);
+ nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
+ "%s (ino=%lu): invalid pointer: %llu", __func__,
+ bmap->b_inode->i_ino, (unsigned long long)ptr);
return -EINVAL;
}
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index dc643de20a251..cfe85e848bba1 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_DIRECT_H
@@ -28,16 +24,6 @@
#include "bmap.h"
-/**
- * struct nilfs_direct_node - direct node
- * @dn_flags: flags
- * @dn_pad: padding
- */
-struct nilfs_direct_node {
- __u8 dn_flags;
- __u8 pad[7];
-};
-
#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
#define NILFS_DIRECT_KEY_MIN 0
#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
index 19ccbf9522ab3..00107fdb93433 100644
--- a/fs/nilfs2/export.h
+++ b/fs/nilfs2/export.h
@@ -20,6 +20,6 @@ struct nilfs_fid {
u32 parent_gen;
u64 parent_ino;
-} __attribute__ ((packed));
+} __packed;
#endif
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 088ba001c6ef5..547381f3ce137 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>,
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji and Ryusuke Konishi.
*/
#include <linux/fs.h>
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 748ca238915a4..853a831dcde08 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
- * and Ryusuke Konishi <ryusuke@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Seiji Kihara, Amagai Yoshiji, and Ryusuke Konishi.
+ * Revised by Ryusuke Konishi.
*
*/
/*
@@ -106,7 +101,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
bh->b_blocknr = pbn;
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, 0, bh);
if (vbn)
bh->b_blocknr = vbn;
out:
@@ -115,7 +110,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
failed:
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
return err;
}
@@ -143,7 +138,8 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
int ret;
ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
- vbn ? : pbn, pbn, READ, out_bh, &pbn);
+ vbn ? : pbn, pbn, REQ_OP_READ, 0,
+ out_bh, &pbn);
if (ret == -EEXIST) /* internal code (cache hit) */
ret = 0;
return ret;
@@ -152,8 +148,15 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
{
wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
+ if (!buffer_uptodate(bh)) {
+ struct inode *inode = bh->b_page->mapping->host;
+
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)",
+ buffer_nilfs_node(bh) ? "node" : "data",
+ inode->i_ino, (unsigned long long)bh->b_blocknr);
return -EIO;
+ }
if (buffer_dirty(bh))
return -EEXIST;
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 6548c7851b485..b8fa45c20c63f 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
*
*/
@@ -68,8 +64,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /* 0 says find free inode from beginning of
- a group. dull code!! */
+ req.pr_entry_nr = 0; /*
+ * 0 says find free inode from beginning
+ * of a group. dull code!!
+ */
req.pr_entry_bh = NULL;
ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
@@ -147,15 +145,14 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
int err;
if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
- nilfs_error(sb, __func__, "bad inode number: %lu",
- (unsigned long) ino);
+ nilfs_error(sb, "bad inode number: %lu", (unsigned long)ino);
return -EINVAL;
}
err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
if (unlikely(err))
- nilfs_warning(sb, __func__, "unable to read inode: %lu",
- (unsigned long) ino);
+ nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu",
+ err, (unsigned long)ino);
return err;
}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 679674d13372a..188b94fe0ec5f 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
*
*/
@@ -27,7 +23,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "alloc.h"
@@ -36,6 +31,7 @@ static inline struct nilfs_inode *
nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
{
void *kaddr = kmap(ibh->b_page);
+
return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 21a1e2e0d92fe..af04f553d7c9d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -87,7 +83,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
__u64 blknum = 0;
int err = 0, ret;
- unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
+ unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits;
down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
@@ -116,13 +112,10 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
* However, the page having this block must
* be locked in this case.
*/
- printk(KERN_WARNING
- "nilfs_get_block: a race condition "
- "while inserting a data block. "
- "(inode number=%lu, file block "
- "offset=%llu)\n",
- inode->i_ino,
- (unsigned long long)blkoff);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
+ __func__, inode->i_ino,
+ (unsigned long long)blkoff);
err = 0;
}
nilfs_transaction_abort(inode->i_sb);
@@ -133,11 +126,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
/* Error handling should be detailed */
set_buffer_new(bh_result);
set_buffer_delay(bh_result);
- map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
- to proper value */
+ map_bh(bh_result, inode->i_sb, 0);
+ /* Disk block number must be changed to proper value */
+
} else if (ret == -ENOENT) {
- /* not found is not error (e.g. hole); must return without
- the mapped state flag. */
+ /*
+ * not found is not error (e.g. hole); must return without
+ * the mapped state flag.
+ */
;
} else {
err = ret;
@@ -167,7 +163,7 @@ static int nilfs_readpage(struct file *file, struct page *page)
* @nr_pages - number of pages to be read
*/
static int nilfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct list_head *pages, unsigned int nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
}
@@ -226,7 +222,7 @@ static int nilfs_set_page_dirty(struct page *page)
int ret = __set_page_dirty_nobuffers(page);
if (page_has_buffers(page)) {
- unsigned nr_dirty = 0;
+ unsigned int nr_dirty = 0;
struct buffer_head *bh, *head;
/*
@@ -249,7 +245,7 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
} else if (ret) {
- unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
nilfs_set_file_dirty(inode, nr_dirty);
}
@@ -291,8 +287,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
- unsigned nr_dirty;
+ unsigned int start = pos & (PAGE_SIZE - 1);
+ unsigned int nr_dirty;
int err;
nr_dirty = nilfs_page_count_clean_buffers(page, start,
@@ -305,7 +301,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
}
static ssize_t
-nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
@@ -313,7 +309,7 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
return 0;
/* Needs synchronization with the cleaner */
- return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
+ return blockdev_direct_IO(iocb, inode, iter, nilfs_get_block);
}
const struct address_space_operations nilfs_aops = {
@@ -360,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
- ii->i_state = 1 << NILFS_I_NEW;
+ ii->i_state = BIT(NILFS_I_NEW);
ii->i_root = root;
err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
@@ -399,23 +395,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
err = nilfs_init_acl(inode, dir);
if (unlikely(err))
- goto failed_after_creation; /* never occur. When supporting
- nilfs_init_acl(), proper cancellation of
- above jobs should be considered */
+ /*
+ * Never occur. When supporting nilfs_init_acl(),
+ * proper cancellation of above jobs should be considered.
+ */
+ goto failed_after_creation;
return inode;
failed_after_creation:
clear_nlink(inode);
unlock_new_inode(inode);
- iput(inode); /* raw_inode will be deleted through
- nilfs_evict_inode() */
+ iput(inode); /*
+ * raw_inode will be deleted through
+ * nilfs_evict_inode().
+ */
goto failed;
failed_ifile_create_inode:
make_bad_inode(inode);
- iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
- called */
+ iput(inode);
failed:
return ERR_PTR(err);
}
@@ -556,7 +555,7 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
inode->i_ino = args->ino;
if (args->for_gc) {
- NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
+ NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
NILFS_I(inode)->i_cno = args->cno;
NILFS_I(inode)->i_root = NULL;
} else {
@@ -666,8 +665,10 @@ void nilfs_write_inode_common(struct inode *inode,
else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_device_code =
cpu_to_le64(huge_encode_dev(inode->i_rdev));
- /* When extending inode, nilfs->ns_inode_size should be checked
- for substitutions of appended fields */
+ /*
+ * When extending inode, nilfs->ns_inode_size should be checked
+ * for substitutions of appended fields.
+ */
}
void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
@@ -685,9 +686,12 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
nilfs_write_inode_common(inode, raw_inode, 0);
- /* XXX: call with has_bmap = 0 is a workaround to avoid
- deadlock of bmap. This delays update of i_bmap to just
- before writing */
+ /*
+ * XXX: call with has_bmap = 0 is a workaround to avoid
+ * deadlock of bmap. This delays update of i_bmap to just
+ * before writing.
+ */
+
nilfs_ifile_unmap_inode(ifile, ino, ibh);
}
@@ -719,9 +723,9 @@ repeat:
goto repeat;
failed:
- nilfs_warning(ii->vfs_inode.i_sb, __func__,
- "failed to truncate bmap (ino=%lu, err=%d)",
- ii->vfs_inode.i_ino, ret);
+ nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING,
+ "error %d truncating bmap (ino=%lu)", ret,
+ ii->vfs_inode.i_ino);
}
void nilfs_truncate(struct inode *inode)
@@ -752,14 +756,15 @@ void nilfs_truncate(struct inode *inode)
nilfs_mark_inode_dirty(inode);
nilfs_set_file_dirty(inode, 0);
nilfs_transaction_commit(sb);
- /* May construct a logical segment and may fail in sync mode.
- But truncate has no return value. */
+ /*
+ * May construct a logical segment and may fail in sync mode.
+ * But truncate has no return value.
+ */
}
static void nilfs_clear_inode(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
- struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
/*
* Free resources allocated in nilfs_read_inode(), here.
@@ -768,8 +773,8 @@ static void nilfs_clear_inode(struct inode *inode)
brelse(ii->i_bh);
ii->i_bh = NULL;
- if (mdi && mdi->mi_palloc_cache)
- nilfs_palloc_destroy_cache(inode);
+ if (nilfs_is_metadata_file_inode(inode))
+ nilfs_mdt_clear(inode);
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
@@ -811,8 +816,10 @@ void nilfs_evict_inode(struct inode *inode)
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
nilfs_transaction_commit(sb);
- /* May construct a logical segment and may fail in sync mode.
- But delete_inode has no return value. */
+ /*
+ * May construct a logical segment and may fail in sync mode.
+ * But delete_inode has no return value.
+ */
}
int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
@@ -856,6 +863,7 @@ out_err:
int nilfs_permission(struct inode *inode, int mask)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
+
if ((mask & MAY_WRITE) && root &&
root->cno != NILFS_CPTREE_CURRENT_CNO)
return -EROFS; /* snapshot is not writable */
@@ -906,7 +914,7 @@ int nilfs_inode_dirty(struct inode *inode)
return ret;
}
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
@@ -919,17 +927,23 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
spin_lock(&nilfs->ns_inode_lock);
if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
!test_bit(NILFS_I_BUSY, &ii->i_state)) {
- /* Because this routine may race with nilfs_dispose_list(),
- we have to check NILFS_I_QUEUED here, too. */
+ /*
+ * Because this routine may race with nilfs_dispose_list(),
+ * we have to check NILFS_I_QUEUED here, too.
+ */
if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
- /* This will happen when somebody is freeing
- this inode. */
- nilfs_warning(inode->i_sb, __func__,
- "cannot get inode (ino=%lu)\n",
- inode->i_ino);
+ /*
+ * This will happen when somebody is freeing
+ * this inode.
+ */
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "cannot set file dirty (ino=%lu): the file is being freed",
+ inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
- return -EINVAL; /* NILFS_I_DIRTY may remain for
- freeing inode */
+ return -EINVAL; /*
+ * NILFS_I_DIRTY may remain for
+ * freeing inode.
+ */
}
list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
set_bit(NILFS_I_QUEUED, &ii->i_state);
@@ -945,8 +959,9 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to reget inode block.\n");
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "cannot mark inode dirty (ino=%lu): error %d loading inode block",
+ inode->i_ino, err);
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -972,8 +987,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
if (is_bad_inode(inode)) {
- nilfs_warning(inode->i_sb, __func__,
- "tried to mark bad_inode dirty. ignored.\n");
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index e8fe24882b5ba..f1d7989459fdb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/fs.h>
@@ -29,7 +25,6 @@
#include <linux/compat.h> /* compat_ptr() */
#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "nilfs.h"
#include "segment.h"
#include "bmap.h"
@@ -588,27 +583,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
if (unlikely(ret < 0)) {
if (ret == -ENOENT)
- printk(KERN_CRIT
- "%s: invalid virtual block address (%s): "
- "ino=%llu, cno=%llu, offset=%llu, "
- "blocknr=%llu, vblocknr=%llu\n",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
return ret;
}
if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
- printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
- "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
brelse(bh);
return -EEXIST;
}
@@ -783,6 +776,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
size_t nmembs = argv->v_nmembs;
struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
struct nilfs_bdesc *bdescs = buf;
+ struct buffer_head *bh;
int ret, i;
for (i = 0; i < nmembs; i++) {
@@ -800,12 +794,16 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
/* skip dead block */
continue;
if (bdescs[i].bd_level == 0) {
- ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
- bdescs[i].bd_offset);
- if (ret < 0) {
+ ret = nilfs_mdt_get_block(nilfs->ns_dat,
+ bdescs[i].bd_offset,
+ false, NULL, &bh);
+ if (unlikely(ret)) {
WARN_ON(ret == -ENOENT);
return ret;
}
+ mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(nilfs->ns_dat);
+ put_bh(bh);
} else {
ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
bdescs[i].bd_level);
@@ -853,8 +851,8 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
- msg, ret);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret,
+ msg);
return ret;
}
@@ -962,10 +960,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
}
ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
- if (ret < 0)
- printk(KERN_ERR "NILFS: GC failed during preparation: "
- "cannot read source blocks: err=%d\n", ret);
- else {
+ if (ret < 0) {
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "error %d preparing GC: cannot read source blocks",
+ ret);
+ } else {
if (nilfs_sb_need_update(nilfs))
set_nilfs_discontinued(nilfs);
ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 1125f40233ffd..d56d3a5bea88d 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#include <linux/buffer_head.h>
@@ -32,6 +28,7 @@
#include "segment.h"
#include "page.h"
#include "mdt.h"
+#include "alloc.h" /* nilfs_palloc_destroy_cache() */
#include <trace/events/nilfs2.h>
@@ -110,7 +107,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
failed_bh:
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
brelse(bh);
failed_unlock:
@@ -124,7 +121,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
static int
nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
- int mode, struct buffer_head **out_bh)
+ int mode, int mode_flags, struct buffer_head **out_bh)
{
struct buffer_head *bh;
__u64 blknum = 0;
@@ -138,7 +135,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
if (buffer_uptodate(bh))
goto out;
- if (mode == READA) {
+ if (mode_flags & REQ_RAHEAD) {
if (!trylock_buffer(bh)) {
ret = -EBUSY;
goto failed_bh;
@@ -160,7 +157,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(mode, bh);
+ submit_bh(mode, mode_flags, bh);
ret = 0;
trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
@@ -170,7 +167,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
failed_bh:
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
brelse(bh);
failed:
return ret;
@@ -184,7 +181,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
int err;
- err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
+ err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh);
if (err == -EEXIST) /* internal code */
goto out;
@@ -194,7 +191,8 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
if (readahead) {
blkoff = block + 1;
for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
- err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+ err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ,
+ REQ_RAHEAD, &bh);
if (likely(!err || err == -EEXIST))
brelse(bh);
else if (err != -EBUSY)
@@ -209,8 +207,12 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
out_no_wait:
err = -EIO;
- if (!buffer_uptodate(first_bh))
+ if (!buffer_uptodate(first_bh)) {
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "I/O error reading meta-data file (ino=%lu, block-offset=%lu)",
+ inode->i_ino, block);
goto failed_bh;
+ }
out:
*out_bh = first_bh;
return 0;
@@ -363,7 +365,7 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
{
pgoff_t index = (pgoff_t)block >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
struct page *page;
unsigned long first_block;
int ret = 0;
@@ -376,7 +378,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
wait_on_page_writeback(page);
first_block = (unsigned long)index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
if (page_has_buffers(page)) {
struct buffer_head *bh;
@@ -385,7 +387,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
}
still_dirty = PageDirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (still_dirty ||
invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
@@ -393,34 +395,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
return ret;
}
-/**
- * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
- * @inode: inode of the meta data file
- * @block: block offset
- *
- * Return Value: On success, it returns 0. On error, the following negative
- * error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- *
- * %-EIO - I/O error
- *
- * %-ENOENT - the specified block does not exist (hole block)
- */
-int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
-{
- struct buffer_head *bh;
- int err;
-
- err = nilfs_mdt_read_block(inode, block, 0, &bh);
- if (unlikely(err))
- return err;
- mark_buffer_dirty(bh);
- nilfs_mdt_mark_dirty(inode);
- brelse(bh);
- return 0;
-}
-
int nilfs_mdt_fetch_dirty(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -497,8 +471,32 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
return 0;
}
-void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
- unsigned header_size)
+/**
+ * nilfs_mdt_clear - do cleanup for the metadata file
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear(struct inode *inode)
+{
+ struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+ if (mdi->mi_palloc_cache)
+ nilfs_palloc_destroy_cache(inode);
+}
+
+/**
+ * nilfs_mdt_destroy - release resources used by the metadata file
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_destroy(struct inode *inode)
+{
+ struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+ kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+ kfree(mdi);
+}
+
+void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size,
+ unsigned int header_size)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -578,7 +576,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return 0;
}
@@ -597,7 +595,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
bh_frozen = nilfs_page_get_nth_block(page, n);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return bh_frozen;
}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 03246cac33384..3f67f3932097b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#ifndef _NILFS_MDT_H
@@ -57,8 +53,8 @@ struct nilfs_shadow_map {
struct nilfs_mdt_info {
struct rw_semaphore mi_sem;
struct blockgroup_lock *mi_bgl;
- unsigned mi_entry_size;
- unsigned mi_first_entry_offset;
+ unsigned int mi_entry_size;
+ unsigned int mi_first_entry_offset;
unsigned long mi_entries_per_block;
struct nilfs_palloc_cache *mi_palloc_cache;
struct nilfs_shadow_map *mi_shadow;
@@ -71,6 +67,11 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
return inode->i_private;
}
+static inline int nilfs_is_metadata_file_inode(const struct inode *inode)
+{
+ return inode->i_private != NULL;
+}
+
/* Default GFP flags using highmem */
#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
@@ -83,11 +84,13 @@ int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
struct buffer_head **out_bh);
int nilfs_mdt_delete_block(struct inode *, unsigned long);
int nilfs_mdt_forget_block(struct inode *, unsigned long);
-int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
int nilfs_mdt_fetch_dirty(struct inode *);
int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
-void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
+void nilfs_mdt_clear(struct inode *inode);
+void nilfs_mdt_destroy(struct inode *inode);
+
+void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int);
int nilfs_mdt_setup_shadow_map(struct inode *inode,
struct nilfs_shadow_map *shadow);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 7ccdb961eea90..dbcf1dc93a511 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji and Ryusuke Konishi.
*/
/*
* linux/fs/ext2/namei.c
@@ -49,6 +44,7 @@
static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
{
int err = nilfs_add_link(dentry, inode);
+
if (!err) {
d_instantiate(dentry, inode);
unlock_new_inode(inode);
@@ -143,7 +139,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
{
struct nilfs_transaction_info ti;
struct super_block *sb = dir->i_sb;
- unsigned l = strlen(symname)+1;
+ unsigned int l = strlen(symname) + 1;
struct inode *inode;
int err;
@@ -287,9 +283,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
goto out;
if (!inode->i_nlink) {
- nilfs_warning(inode->i_sb, __func__,
- "deleting nonexistent file (%lu), %d\n",
- inode->i_ino, inode->i_nlink);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "deleting nonexistent file (ino=%lu), %d",
+ inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
@@ -431,11 +427,11 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
nilfs_transaction_abort(old_dir->i_sb);
return err;
@@ -457,7 +453,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
root = NILFS_I(d_inode(child))->i_root;
- inode = nilfs_iget(d_inode(child)->i_sb, root, ino);
+ inode = nilfs_iget(child->d_sb, root, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 3857040275755..33f8c8fc96e8e 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Koji Sato and Ryusuke Konishi.
*/
#ifndef _NILFS_H
@@ -28,7 +23,8 @@
#include <linux/buffer_head.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_api.h>
+#include <linux/nilfs2_ondisk.h>
#include "the_nilfs.h"
#include "bmap.h"
@@ -69,8 +65,10 @@ struct nilfs_inode_info {
*/
struct rw_semaphore xattr_sem;
#endif
- struct buffer_head *i_bh; /* i_bh contains a new or dirty
- disk inode */
+ struct buffer_head *i_bh; /*
+ * i_bh contains a new or dirty
+ * disk inode.
+ */
struct nilfs_root *i_root;
struct inode vfs_inode;
};
@@ -100,8 +98,10 @@ enum {
NILFS_I_NEW = 0, /* Inode is newly created */
NILFS_I_DIRTY, /* The file is dirty */
NILFS_I_QUEUED, /* inode is in dirty_files list */
- NILFS_I_BUSY, /* inode is grabbed by a segment
- constructor */
+ NILFS_I_BUSY, /*
+ * Inode is grabbed by a segment
+ * constructor
+ */
NILFS_I_COLLECTED, /* All dirty blocks are collected */
NILFS_I_UPDATED, /* The file has been written back */
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
@@ -120,20 +120,19 @@ enum {
/*
* Macros to check inode numbers
*/
-#define NILFS_MDT_INO_BITS \
- ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
- 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
- 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
+#define NILFS_MDT_INO_BITS \
+ (BIT(NILFS_DAT_INO) | BIT(NILFS_CPFILE_INO) | \
+ BIT(NILFS_SUFILE_INO) | BIT(NILFS_IFILE_INO) | \
+ BIT(NILFS_ATIME_INO) | BIT(NILFS_SKETCH_INO))
-#define NILFS_SYS_INO_BITS \
- ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
+#define NILFS_SYS_INO_BITS (BIT(NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
+ ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
+ ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
/**
* struct nilfs_transaction_info: context information for synchronization
@@ -145,8 +144,10 @@ enum {
struct nilfs_transaction_info {
u32 ti_magic;
void *ti_save;
- /* This should never used. If this happens,
- one of other filesystems has a bug. */
+ /*
+ * This should never be used. If it happens,
+ * one of other filesystems has a bug.
+ */
unsigned short ti_flags;
unsigned short ti_count;
};
@@ -156,8 +157,10 @@ struct nilfs_transaction_info {
/* ti_flags */
#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
-#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
- end of transaction. */
+#define NILFS_TI_SYNC 0x0002 /*
+ * Force to construct segment at the
+ * end of transaction.
+ */
#define NILFS_TI_GC 0x0004 /* GC context */
#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
#define NILFS_TI_WRITER 0x0010 /* Constructor context */
@@ -279,7 +282,7 @@ extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
int nilfs_permission(struct inode *inode, int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty);
extern int __nilfs_mark_inode_dirty(struct inode *, int);
extern void nilfs_dirty_inode(struct inode *, int flags);
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -296,10 +299,36 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
extern void nilfs_destroy_inode(struct inode *);
+
extern __printf(3, 4)
-void nilfs_error(struct super_block *, const char *, const char *, ...);
+void __nilfs_msg(struct super_block *sb, const char *level,
+ const char *fmt, ...);
extern __printf(3, 4)
-void nilfs_warning(struct super_block *, const char *, const char *, ...);
+void __nilfs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+
+#ifdef CONFIG_PRINTK
+
+#define nilfs_msg(sb, level, fmt, ...) \
+ __nilfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#define nilfs_error(sb, fmt, ...) \
+ __nilfs_error(sb, __func__, fmt, ##__VA_ARGS__)
+
+#else
+
+#define nilfs_msg(sb, level, fmt, ...) \
+ do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ (void)(sb); \
+ } while (0)
+#define nilfs_error(sb, fmt, ...) \
+ do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __nilfs_error(sb, "", " "); \
+ } while (0)
+
+#endif /* CONFIG_PRINTK */
+
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index c20df77eff99f..f11a3ad2df0cd 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- * Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
*/
#include <linux/pagemap.h>
@@ -35,9 +30,9 @@
#include "mdt.h"
-#define NILFS_BUFFER_INHERENT_BITS \
- ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
- (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked))
+#define NILFS_BUFFER_INHERENT_BITS \
+ (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \
+ BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked))
static struct buffer_head *
__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -50,7 +45,7 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << blkbits, b_state);
- first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
+ first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
bh = nilfs_page_get_nth_block(page, block - first_block);
touch_buffer(bh);
@@ -64,7 +59,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
unsigned long b_state)
{
int blkbits = inode->i_blkbits;
- pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
+ pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits);
struct page *page;
struct buffer_head *bh;
@@ -75,7 +70,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
if (unlikely(!bh)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return NULL;
}
return bh;
@@ -90,9 +85,9 @@ void nilfs_forget_buffer(struct buffer_head *bh)
{
struct page *page = bh->b_page;
const unsigned long clear_bits =
- (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
- 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+ (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
+ BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
lock_buffer(bh);
set_mask_bits(&bh->b_state, clear_bits, 0);
@@ -129,17 +124,17 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
dbh->b_bdev = sbh->b_bdev;
bh = dbh;
- bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
+ bits = sbh->b_state & (BIT(BH_Uptodate) | BIT(BH_Mapped));
while ((bh = bh->b_this_page) != dbh) {
lock_buffer(bh);
bits &= bh->b_state;
unlock_buffer(bh);
}
- if (bits & (1UL << BH_Uptodate))
+ if (bits & BIT(BH_Uptodate))
SetPageUptodate(dpage);
else
ClearPageUptodate(dpage);
- if (bits & (1UL << BH_Mapped))
+ if (bits & BIT(BH_Mapped))
SetPageMappedToDisk(dpage);
else
ClearPageMappedToDisk(dpage);
@@ -220,7 +215,7 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
create_empty_buffers(dst, sbh->b_size, 0);
if (copy_dirty)
- mask |= (1UL << BH_Dirty);
+ mask |= BIT(BH_Dirty);
dbh = dbufs = page_buffers(dst);
do {
@@ -288,7 +283,7 @@ repeat:
__set_page_dirty_nobuffers(dpage);
unlock_page(dpage);
- page_cache_release(dpage);
+ put_page(dpage);
unlock_page(page);
}
pagevec_release(&pvec);
@@ -333,7 +328,7 @@ repeat:
WARN_ON(PageDirty(dpage));
nilfs_copy_page(dpage, page, 0);
unlock_page(dpage);
- page_cache_release(dpage);
+ put_page(dpage);
} else {
struct page *page2;
@@ -350,7 +345,7 @@ repeat:
if (unlikely(err < 0)) {
WARN_ON(err == -EEXIST);
page->mapping = NULL;
- page_cache_release(page); /* for cache */
+ put_page(page); /* for cache */
} else {
page->mapping = dmap;
dmap->nrpages++;
@@ -408,11 +403,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
BUG_ON(!PageLocked(page));
- if (!silent) {
- nilfs_warning(sb, __func__,
- "discard page: offset %lld, ino %lu",
- page_offset(page), inode->i_ino);
- }
+ if (!silent)
+ nilfs_msg(sb, KERN_WARNING,
+ "discard dirty page: offset=%lld, ino=%lu",
+ page_offset(page), inode->i_ino);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -420,18 +414,18 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
if (page_has_buffers(page)) {
struct buffer_head *bh, *head;
const unsigned long clear_bits =
- (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
- 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+ (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
+ BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
bh = head = page_buffers(page);
do {
lock_buffer(bh);
- if (!silent) {
- nilfs_warning(sb, __func__,
- "discard block %llu, size %zu",
- (u64)bh->b_blocknr, bh->b_size);
- }
+ if (!silent)
+ nilfs_msg(sb, KERN_WARNING,
+ "discard dirty block: blocknr=%llu, size=%zu",
+ (u64)bh->b_blocknr, bh->b_size);
+
set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
@@ -440,12 +434,12 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
__nilfs_clear_page_dirty(page);
}
-unsigned nilfs_page_count_clean_buffers(struct page *page,
- unsigned from, unsigned to)
+unsigned int nilfs_page_count_clean_buffers(struct page *page,
+ unsigned int from, unsigned int to)
{
- unsigned block_start, block_end;
+ unsigned int block_start, block_end;
struct buffer_head *bh, *head;
- unsigned nc = 0;
+ unsigned int nc = 0;
for (bh = head = page_buffers(page), block_start = 0;
bh != head || !block_start;
@@ -523,8 +517,8 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
if (inode->i_mapping->nrpages == 0)
return 0;
- index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
+ nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
pagevec_init(&pvec, 0);
@@ -537,7 +531,7 @@ repeat:
if (length > 0 && pvec.pages[0]->index > index)
goto out;
- b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits);
i = 0;
do {
page = pvec.pages[i];
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index a43b8287d012a..f3687c958fa84 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- * Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
*/
#ifndef _NILFS_PAGE_H
@@ -58,7 +53,8 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_page(struct page *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
-unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
+ unsigned int);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 9b4f205d11736..5139efed18882 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#include <linux/buffer_head.h>
@@ -47,8 +43,10 @@ enum {
/* work structure for recovery */
struct nilfs_recovery_block {
- ino_t ino; /* Inode number of the file that this block
- belongs to */
+ ino_t ino; /*
+ * Inode number of the file that this block
+ * belongs to
+ */
sector_t blocknr; /* block number */
__u64 vblocknr; /* virtual block number */
unsigned long blkoff; /* File offset of the data block (per block) */
@@ -56,38 +54,37 @@ struct nilfs_recovery_block {
};
-static int nilfs_warn_segment_error(int err)
+static int nilfs_warn_segment_error(struct super_block *sb, int err)
{
+ const char *msg = NULL;
+
switch (err) {
case NILFS_SEG_FAIL_IO:
- printk(KERN_WARNING
- "NILFS warning: I/O error on loading last segment\n");
+ nilfs_msg(sb, KERN_ERR, "I/O error reading segment");
return -EIO;
case NILFS_SEG_FAIL_MAGIC:
- printk(KERN_WARNING
- "NILFS warning: Segment magic number invalid\n");
+ msg = "Magic number mismatch";
break;
case NILFS_SEG_FAIL_SEQ:
- printk(KERN_WARNING
- "NILFS warning: Sequence number mismatch\n");
+ msg = "Sequence number mismatch";
break;
case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
- printk(KERN_WARNING
- "NILFS warning: Checksum error in super root\n");
+ msg = "Checksum error in super root";
break;
case NILFS_SEG_FAIL_CHECKSUM_FULL:
- printk(KERN_WARNING
- "NILFS warning: Checksum error in segment payload\n");
+ msg = "Checksum error in segment payload";
break;
case NILFS_SEG_FAIL_CONSISTENCY:
- printk(KERN_WARNING
- "NILFS warning: Inconsistent segment\n");
+ msg = "Inconsistency found";
break;
case NILFS_SEG_NO_SUPER_ROOT:
- printk(KERN_WARNING
- "NILFS warning: No super root in the last segment\n");
+ msg = "No super root in the last segment";
break;
+ default:
+ nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err);
+ return -EINVAL;
}
+ nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg);
return -EINVAL;
}
@@ -156,7 +153,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
sr = (struct nilfs_super_root *)bh_sr->b_data;
if (check) {
- unsigned bytes = le16_to_cpu(sr->sr_bytes);
+ unsigned int bytes = le16_to_cpu(sr->sr_bytes);
if (bytes == 0 || bytes > nilfs->ns_blocksize) {
ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
@@ -180,7 +177,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
brelse(bh_sr);
failed:
- return nilfs_warn_segment_error(ret);
+ return nilfs_warn_segment_error(nilfs->ns_sb, ret);
}
/**
@@ -508,7 +505,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
{
struct inode *inode;
struct nilfs_recovery_block *rb, *n;
- unsigned blocksize = nilfs->ns_blocksize;
+ unsigned int blocksize = nilfs->ns_blocksize;
struct page *page;
loff_t pos;
int err = 0, err2 = 0;
@@ -526,6 +523,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
0, &page, nilfs_get_block);
if (unlikely(err)) {
loff_t isize = inode->i_size;
+
if (pos + blocksize > isize)
nilfs_write_failed(inode->i_mapping,
pos + blocksize);
@@ -544,21 +542,20 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
blocksize, page, NULL);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
(*nr_salvaged_blocks)++;
goto next;
failed_page:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
failed_inode:
- printk(KERN_WARNING
- "NILFS warning: error recovering data block "
- "(err=%d, ino=%lu, block-offset=%llu)\n",
- err, (unsigned long)rb->ino,
- (unsigned long long)rb->blkoff);
+ nilfs_msg(sb, KERN_WARNING,
+ "error %d recovering data block (ino=%lu, block-offset=%llu)",
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
@@ -681,8 +678,8 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
}
if (nsalvaged_blocks) {
- printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
- sb->s_id, nsalvaged_blocks);
+ nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks",
+ nsalvaged_blocks);
ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
}
out:
@@ -693,10 +690,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
confused:
err = -EINVAL;
failed:
- printk(KERN_ERR
- "NILFS (device %s): Error roll-forwarding "
- "(err=%d, pseg block=%llu). ",
- sb->s_id, err, (unsigned long long)pseg_start);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d roll-forwarding partial segment at blocknr = %llu",
+ err, (unsigned long long)pseg_start);
goto out;
}
@@ -716,9 +712,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
set_buffer_dirty(bh);
err = sync_dirty_buffer(bh);
if (unlikely(err))
- printk(KERN_WARNING
- "NILFS warning: buffer sync write failed during "
- "post-cleaning of recovery.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_WARNING,
+ "buffer sync write failed during post-cleaning of recovery.");
brelse(bh);
}
@@ -753,8 +748,8 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
if (unlikely(err)) {
- printk(KERN_ERR
- "NILFS: error loading the latest checkpoint.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d loading the latest checkpoint", err);
return err;
}
@@ -765,8 +760,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: Error preparing segments for "
- "recovery.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d preparing segment for recovery",
+ err);
goto failed;
}
@@ -779,8 +775,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
nilfs_detach_log_writer(sb);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: Oops! recovery failed. "
- "(err=%d)\n", err);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d writing segment for recovery",
+ err);
goto failed;
}
@@ -872,9 +869,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
flags = le16_to_cpu(sum->ss_flags);
if (!(flags & NILFS_SS_SR) && !scan_newer) {
- /* This will never happen because a superblock
- (last_segment) always points to a pseg
- having a super root. */
+ /*
+ * This will never happen because a superblock
+ * (last_segment) always points to a pseg with
+ * a super root.
+ */
ret = NILFS_SEG_FAIL_CONSISTENCY;
goto failed;
}
@@ -960,5 +959,5 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
failed:
brelse(bh_sum);
nilfs_dispose_segment_list(&segments);
- return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
+ return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret);
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index f63620ce38922..6f87b2ac1aeb0 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -133,7 +129,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
return 0;
}
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags,
time_t ctime, __u64 cno)
{
int err;
@@ -240,7 +236,7 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
{
struct nilfs_super_root *raw_sr;
struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
- unsigned srsize;
+ unsigned int srsize;
u32 crc;
raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
@@ -350,7 +346,8 @@ static void nilfs_end_bio_write(struct bio *bio)
}
static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
- struct nilfs_write_info *wi, int mode)
+ struct nilfs_write_info *wi, int mode,
+ int mode_flags)
{
struct bio *bio = wi->bio;
int err;
@@ -368,7 +365,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
- submit_bio(mode, bio);
+ bio_set_op_attrs(bio, mode, mode_flags);
+ submit_bio(bio);
segbuf->sb_nbio++;
wi->bio = NULL;
@@ -441,7 +439,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
return 0;
}
/* bio is FULL */
- err = nilfs_segbuf_submit_bio(segbuf, wi, mode);
+ err = nilfs_segbuf_submit_bio(segbuf, wi, mode, 0);
/* never submit current bh */
if (likely(!err))
goto repeat;
@@ -465,19 +463,19 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
{
struct nilfs_write_info wi;
struct buffer_head *bh;
- int res = 0, rw = WRITE;
+ int res = 0;
wi.nilfs = nilfs;
nilfs_segbuf_prepare_write(segbuf, &wi);
list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
if (unlikely(res))
goto failed_bio;
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
if (unlikely(res))
goto failed_bio;
}
@@ -487,8 +485,8 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
* Last BIO is always sent through the following
* submission.
*/
- rw |= REQ_SYNC;
- res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
+ res = nilfs_segbuf_submit_bio(segbuf, &wi, REQ_OP_WRITE,
+ REQ_SYNC);
}
failed_bio:
@@ -516,7 +514,11 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
} while (--segbuf->sb_nbio > 0);
if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
- printk(KERN_ERR "NILFS: IO error writing segment\n");
+ nilfs_msg(segbuf->sb_super, KERN_ERR,
+ "I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu",
+ (unsigned long long)segbuf->sb_pseg_start,
+ segbuf->sb_sum.nblocks,
+ (unsigned long long)segbuf->sb_segnum);
err = -EIO;
}
return err;
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index b04f08cc23976..7bbccc099709a 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
#ifndef _NILFS_SEGBUF_H
@@ -82,7 +78,7 @@ struct nilfs_segment_buffer {
__u64 sb_nextnum;
sector_t sb_fseg_start, sb_fseg_end;
sector_t sb_pseg_start;
- unsigned sb_rest_blocks;
+ unsigned int sb_rest_blocks;
/* Buffers */
struct list_head sb_segsum_buffers;
@@ -124,7 +120,8 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
struct nilfs_segment_buffer *prev);
void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t,
+ __u64);
int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
struct buffer_head **);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3b65adaae7e47..bedcae2c28e62 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -49,18 +45,26 @@
*/
#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
-#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
- appended in collection retry loop */
+#define SC_MAX_SEGDELTA 64 /*
+ * Upper limit of the number of segments
+ * appended in collection retry loop
+ */
/* Construction mode */
enum {
SC_LSEG_SR = 1, /* Make a logical segment having a super root */
- SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
- a logical segment without a super root */
- SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
- creating a checkpoint */
- SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
- a checkpoint */
+ SC_LSEG_DSYNC, /*
+ * Flush data blocks of a given file and make
+ * a logical segment without a super root.
+ */
+ SC_FLUSH_FILE, /*
+ * Flush data files, leads to segment writes without
+ * creating a checkpoint.
+ */
+ SC_FLUSH_DAT, /*
+ * Flush DAT file. This also creates segments
+ * without a checkpoint.
+ */
};
/* Stage numbers of dirty block collection */
@@ -146,7 +150,8 @@ static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
-static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
+static int nilfs_prepare_segment_lock(struct super_block *sb,
+ struct nilfs_transaction_info *ti)
{
struct nilfs_transaction_info *cur_ti = current->journal_info;
void *save = NULL;
@@ -154,17 +159,14 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
if (cur_ti) {
if (cur_ti->ti_magic == NILFS_TI_MAGIC)
return ++cur_ti->ti_count;
- else {
- /*
- * If journal_info field is occupied by other FS,
- * it is saved and will be restored on
- * nilfs_transaction_commit().
- */
- printk(KERN_WARNING
- "NILFS warning: journal info from a different "
- "FS\n");
- save = current->journal_info;
- }
+
+ /*
+ * If journal_info field is occupied by other FS,
+ * it is saved and will be restored on
+ * nilfs_transaction_commit().
+ */
+ nilfs_msg(sb, KERN_WARNING, "journal info from a different FS");
+ save = current->journal_info;
}
if (!ti) {
ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
@@ -213,7 +215,7 @@ int nilfs_transaction_begin(struct super_block *sb,
int vacancy_check)
{
struct the_nilfs *nilfs;
- int ret = nilfs_prepare_segment_lock(ti);
+ int ret = nilfs_prepare_segment_lock(sb, ti);
struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
@@ -371,7 +373,7 @@ static void nilfs_transaction_lock(struct super_block *sb,
nilfs_segctor_do_immediate_flush(sci);
up_write(&nilfs->ns_segctor_sem);
- yield();
+ cond_resched();
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
@@ -397,10 +399,10 @@ static void nilfs_transaction_unlock(struct super_block *sb)
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
struct nilfs_segsum_pointer *ssp,
- unsigned bytes)
+ unsigned int bytes)
{
struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
- unsigned blocksize = sci->sc_super->s_blocksize;
+ unsigned int blocksize = sci->sc_super->s_blocksize;
void *p;
if (unlikely(ssp->offset + bytes > blocksize)) {
@@ -422,8 +424,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
struct buffer_head *sumbh;
- unsigned sumbytes;
- unsigned flags = 0;
+ unsigned int sumbytes;
+ unsigned int flags = 0;
int err;
if (nilfs_doing_gc())
@@ -444,8 +446,10 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
{
sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
- return -E2BIG; /* The current segment is filled up
- (internal code) */
+ return -E2BIG; /*
+ * The current segment is filled up
+ * (internal code)
+ */
sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
return nilfs_segctor_reset_segment_buffer(sci);
}
@@ -472,9 +476,9 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
*/
static int nilfs_segctor_segsum_block_required(
struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
- unsigned binfo_size)
+ unsigned int binfo_size)
{
- unsigned blocksize = sci->sc_super->s_blocksize;
+ unsigned int blocksize = sci->sc_super->s_blocksize;
/* Size of finfo and binfo is enough small against blocksize */
return ssp->offset + binfo_size +
@@ -533,7 +537,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
struct buffer_head *bh,
struct inode *inode,
- unsigned binfo_size)
+ unsigned int binfo_size)
{
struct nilfs_segment_buffer *segbuf;
int required, err = 0;
@@ -617,7 +621,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
*vblocknr = binfo->bi_v.bi_vblocknr;
}
-static struct nilfs_sc_operations nilfs_sc_file_ops = {
+static const struct nilfs_sc_operations nilfs_sc_file_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_file_bmap,
@@ -666,7 +670,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
*binfo_dat = binfo->bi_dat;
}
-static struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static const struct nilfs_sc_operations nilfs_sc_dat_ops = {
.collect_data = nilfs_collect_dat_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_dat_bmap,
@@ -674,7 +678,7 @@ static struct nilfs_sc_operations nilfs_sc_dat_ops = {
.write_node_binfo = nilfs_write_dat_node_binfo,
};
-static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static const struct nilfs_sc_operations nilfs_sc_dsync_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = NULL,
.collect_bmap = NULL,
@@ -777,7 +781,7 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
{
struct nilfs_inode_info *ii, *n;
struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
- unsigned nv = 0;
+ unsigned int nv = 0;
while (!list_empty(head)) {
spin_lock(&nilfs->ns_inode_lock);
@@ -875,9 +879,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
&raw_cp, &bh_cp);
if (likely(!err)) {
- /* The following code is duplicated with cpfile. But, it is
- needed to collect the checkpoint even if it was not newly
- created */
+ /*
+ * The following code is duplicated with cpfile. But, it is
+ * needed to collect the checkpoint even if it was not newly
+ * created.
+ */
mark_buffer_dirty(bh_cp);
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
@@ -958,7 +964,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
{
struct buffer_head *bh_sr;
struct nilfs_super_root *raw_sr;
- unsigned isz, srsz;
+ unsigned int isz, srsz;
bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
@@ -1043,7 +1049,7 @@ static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
struct inode *inode,
- struct nilfs_sc_operations *sc_ops)
+ const struct nilfs_sc_operations *sc_ops)
{
LIST_HEAD(data_buffers);
LIST_HEAD(node_buffers);
@@ -1406,8 +1412,10 @@ static void nilfs_free_incomplete_logs(struct list_head *logs,
if (atomic_read(&segbuf->sb_err)) {
/* Case 1: The first segment failed */
if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
- /* Case 1a: Partial segment appended into an existing
- segment */
+ /*
+ * Case 1a: Partial segment appended into an existing
+ * segment
+ */
nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
segbuf->sb_fseg_end);
else /* Case 1b: New full segment */
@@ -1550,7 +1558,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
sector_t blocknr;
unsigned long nfinfo = segbuf->sb_sum.nfinfo;
unsigned long nblocks = 0, ndatablk = 0;
- struct nilfs_sc_operations *sc_op = NULL;
+ const struct nilfs_sc_operations *sc_op = NULL;
struct nilfs_segsum_pointer ssp;
struct nilfs_finfo *finfo = NULL;
union nilfs_binfo binfo;
@@ -1631,8 +1639,10 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
static void nilfs_begin_page_io(struct page *page)
{
if (!page || PageWriteback(page))
- /* For split b-tree node pages, this function may be called
- twice. We ignore the 2nd or later calls by this check. */
+ /*
+ * For split b-tree node pages, this function may be called
+ * twice. We ignore the 2nd or later calls by this check.
+ */
return;
lock_page(page);
@@ -1848,11 +1858,11 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
*/
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- const unsigned long set_bits = (1 << BH_Uptodate);
+ const unsigned long set_bits = BIT(BH_Uptodate);
const unsigned long clear_bits =
- (1 << BH_Dirty | 1 << BH_Async_Write |
- 1 << BH_Delay | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Redirected);
+ (BIT(BH_Dirty) | BIT(BH_Async_Write) |
+ BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Redirected));
set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
@@ -1941,8 +1951,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
err = nilfs_ifile_get_inode_block(
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
- nilfs_warning(sci->sc_super, __func__,
- "failed to get inode block.\n");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "log writer: error %d getting inode block (ino=%lu)",
+ err, ii->vfs_inode.i_ino);
return err;
}
mark_buffer_dirty(ibh);
@@ -2070,7 +2081,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
goto failed_to_write;
if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
- nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
+ nilfs->ns_blocksize_bits != PAGE_SHIFT) {
/*
* At this point, we avoid double buffering
* for blocksize < pagesize because page dirty
@@ -2121,10 +2132,10 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
{
spin_lock(&sci->sc_state_lock);
- if (!(sci->sc_flush_request & (1 << bn))) {
+ if (!(sci->sc_flush_request & BIT(bn))) {
unsigned long prev_req = sci->sc_flush_request;
- sci->sc_flush_request |= (1 << bn);
+ sci->sc_flush_request |= BIT(bn);
if (!prev_req)
wake_up(&sci->sc_wait_daemon);
}
@@ -2308,7 +2319,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
}
#define FLUSH_FILE_BIT (0x1) /* data file only */
-#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
+#define FLUSH_DAT_BIT BIT(NILFS_DAT_INO) /* DAT only */
/**
* nilfs_segctor_accept - record accepted sequence count of log-write requests
@@ -2395,6 +2406,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_construction_timeout(unsigned long data)
{
struct task_struct *p = (struct task_struct *)data;
+
wake_up_process(p);
}
@@ -2447,8 +2459,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
if (likely(!err))
break;
- nilfs_warning(sb, __func__,
- "segment construction failed. (err=%d)", err);
+ nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(sci->sc_interval);
}
@@ -2456,9 +2467,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
sci->sc_nfreesegs);
if (ret) {
- printk(KERN_WARNING
- "NILFS warning: error %d on discard request, "
- "turning discards off for the device\n", ret);
+ nilfs_msg(sb, KERN_WARNING,
+ "error %d on discard request, turning discards off for the device",
+ ret);
nilfs_clear_opt(nilfs, DISCARD);
}
}
@@ -2540,10 +2551,9 @@ static int nilfs_segctor_thread(void *arg)
/* start sync. */
sci->sc_task = current;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
- printk(KERN_INFO
- "segctord starting. Construction interval = %lu seconds, "
- "CP frequency < %lu seconds\n",
- sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ nilfs_msg(sci->sc_super, KERN_INFO,
+ "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
+ sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
spin_lock(&sci->sc_state_lock);
loop:
@@ -2555,10 +2565,10 @@ static int nilfs_segctor_thread(void *arg)
if (timeout || sci->sc_seq_request != sci->sc_seq_done)
mode = SC_LSEG_SR;
- else if (!sci->sc_flush_request)
- break;
- else
+ else if (sci->sc_flush_request)
mode = nilfs_segctor_flush_mode(sci);
+ else
+ break;
spin_unlock(&sci->sc_state_lock);
nilfs_segctor_thread_construct(sci, mode);
@@ -2617,8 +2627,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
if (IS_ERR(t)) {
int err = PTR_ERR(t);
- printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
- err);
+ nilfs_msg(sci->sc_super, KERN_ERR,
+ "error %d creating segctord thread", err);
return err;
}
wait_event(sci->sc_wait_task, sci->sc_task != NULL);
@@ -2684,8 +2694,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
{
int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
- /* The segctord thread was stopped and its timer was removed.
- But some tasks remain. */
+ /*
+ * The segctord thread was stopped and its timer was removed.
+ * But some tasks remain.
+ */
do {
struct nilfs_transaction_info ti;
@@ -2726,14 +2738,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_segctor_write_out(sci);
if (!list_empty(&sci->sc_dirty_files)) {
- nilfs_warning(sci->sc_super, __func__,
- "dirty file(s) after the final construction\n");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "disposed unprocessed dirty file(s) when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
- nilfs_warning(sci->sc_super, __func__,
- "iput queue is not empty\n");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "disposed unprocessed inode(s) in iput queue when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2809,8 +2821,8 @@ void nilfs_detach_log_writer(struct super_block *sb)
spin_lock(&nilfs->ns_inode_lock);
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
- nilfs_warning(sb, __func__,
- "Hit dirty file after stopped log writer\n");
+ nilfs_msg(sb, KERN_WARNING,
+ "disposed unprocessed dirty file(s) when detaching log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0408b9b2814b2..1060949d7dd2a 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
#ifndef _NILFS_SEGMENT_H
@@ -27,7 +23,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
-#include <linux/nilfs2_fs.h>
#include "nilfs.h"
struct nilfs_root;
@@ -75,7 +70,7 @@ struct nilfs_recovery_info {
*/
struct nilfs_cstage {
int scnt;
- unsigned flags;
+ unsigned int flags;
struct nilfs_inode_info *dirty_file_ptr;
struct nilfs_inode_info *gc_inode_ptr;
};
@@ -84,7 +79,7 @@ struct nilfs_segment_buffer;
struct nilfs_segsum_pointer {
struct buffer_head *bh;
- unsigned offset; /* offset in bytes */
+ unsigned int offset; /* offset in bytes */
};
/**
@@ -193,11 +188,15 @@ enum {
NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
NILFS_SC_UNCLOSED, /* Logical segment is not closed */
NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
- NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
- checkpoint */
- NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
- other than DAT, cpfile, sufile, or files
- moved by GC */
+ NILFS_SC_PRIOR_FLUSH, /*
+ * Requesting immediate flush without making a
+ * checkpoint
+ */
+ NILFS_SC_HAVE_DELTA, /*
+ * Next checkpoint will have update of files
+ * other than DAT, cpfile, sufile, or files
+ * moved by GC.
+ */
};
/* sc_state */
@@ -207,17 +206,23 @@ enum {
/*
* Constant parameters
*/
-#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
- destroying segctord */
+#define NILFS_SC_CLEANUP_RETRY 3 /*
+ * Retry count of construction when
+ * destroying segctord
+ */
/*
* Default values of timeout, in seconds.
*/
-#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
- It triggers construction of a
- logical segment with a super root */
-#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
- creation */
+#define NILFS_SC_DEFAULT_TIMEOUT 5 /*
+ * Timeout value of dirty blocks.
+ * It triggers construction of a
+ * logical segment with a super root.
+ */
+#define NILFS_SC_DEFAULT_SR_FREQ 30 /*
+ * Maximum frequency of super root
+ * creation
+ */
/*
* The default threshold amount of data, in block counts.
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 52821ffc11f46..1541a1e9221a5 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Koji Sato.
+ * Revised by Ryusuke Konishi.
*/
#include <linux/kernel.h>
@@ -26,7 +22,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/errno.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "sufile.h"
@@ -61,6 +56,7 @@ static unsigned long
nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+
do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
return (unsigned long)t;
}
@@ -69,6 +65,7 @@ static unsigned long
nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+
return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
}
@@ -183,9 +180,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
down_write(&NILFS_MDT(sufile)->mi_sem);
for (seg = segnumv; seg < segnumv + nsegs; seg++) {
if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
- printk(KERN_WARNING
- "%s: invalid segment number: %llu\n", __func__,
- (unsigned long long)*seg);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)*seg);
nerr++;
}
}
@@ -242,8 +239,9 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
int ret;
if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
- printk(KERN_WARNING "%s: invalid segment number: %llu\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)segnum);
return -EINVAL;
}
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -421,8 +419,9 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
- printk(KERN_WARNING "%s: segment %llu must be clean\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: segment %llu must be clean", __func__,
+ (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -446,7 +445,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
- if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
+ if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) &&
su->su_nblocks == cpu_to_le32(0)) {
kunmap_atomic(kaddr);
return;
@@ -457,7 +456,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
/* make the segment garbage */
su->su_lastmod = cpu_to_le64(0);
su->su_nblocks = cpu_to_le32(0);
- su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
+ su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY));
kunmap_atomic(kaddr);
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
@@ -478,8 +477,9 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
- printk(KERN_WARNING "%s: segment %llu is already clean\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: segment %llu is already clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -694,7 +694,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
su2 = su;
for (j = 0; j < n; j++, su = (void *)su + susz) {
if ((le32_to_cpu(su->su_flags) &
- ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
+ ~BIT(NILFS_SEGMENT_USAGE_ERROR)) ||
nilfs_segment_is_active(nilfs, segnum + j)) {
ret = -EBUSY;
kunmap_atomic(kaddr);
@@ -819,7 +819,7 @@ out:
* %-ENOMEM - Insufficient amount of memory available.
*/
ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
- unsigned sisz, size_t nsi)
+ unsigned int sisz, size_t nsi)
{
struct buffer_head *su_bh;
struct nilfs_segment_usage *su;
@@ -861,10 +861,10 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
si->sui_lastmod = le64_to_cpu(su->su_lastmod);
si->sui_nblocks = le32_to_cpu(su->su_nblocks);
si->sui_flags = le32_to_cpu(su->su_flags) &
- ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ ~BIT(NILFS_SEGMENT_USAGE_ACTIVE);
if (nilfs_segment_is_active(nilfs, segnum + j))
si->sui_flags |=
- (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ BIT(NILFS_SEGMENT_USAGE_ACTIVE);
}
kunmap_atomic(kaddr);
brelse(su_bh);
@@ -897,7 +897,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
* %-EINVAL - Invalid values in input (segment number, flags or nblocks)
*/
ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
- unsigned supsz, size_t nsup)
+ unsigned int supsz, size_t nsup)
{
struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
struct buffer_head *header_bh, *bh;
@@ -952,7 +952,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
* disk.
*/
sup->sup_sui.sui_flags &=
- ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ ~BIT(NILFS_SEGMENT_USAGE_ACTIVE);
cleansi = nilfs_suinfo_clean(&sup->sup_sui);
cleansu = nilfs_segment_usage_clean(su);
@@ -1177,14 +1177,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
int err;
if (susize > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large segment usage size: %zu bytes.\n",
- susize);
+ nilfs_msg(sb, KERN_ERR,
+ "too large segment usage size: %zu bytes", susize);
return -EINVAL;
} else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
- printk(KERN_ERR
- "NILFS: too small segment usage size: %zu bytes.\n",
- susize);
+ nilfs_msg(sb, KERN_ERR,
+ "too small segment usage size: %zu bytes", susize);
return -EINVAL;
}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index b8afd72f2379e..158a9190c8ec2 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_SUFILE_H
@@ -25,7 +21,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
@@ -42,9 +37,9 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
unsigned long nblocks, time_t modtime);
int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
-ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int,
size_t);
-ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t);
+ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned int, size_t);
int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
void (*dofunc)(struct inode *, __u64,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7f5d3d9f1c37b..c95d369e90aa9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
/*
* linux/fs/ext2/super.c
@@ -75,6 +71,22 @@ struct kmem_cache *nilfs_btree_path_cache;
static int nilfs_setup_super(struct super_block *sb, int is_mount);
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
+void __nilfs_msg(struct super_block *sb, const char *level, const char *fmt,
+ ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ printk("%sNILFS (%s): %pV\n", level, sb->s_id, &vaf);
+ else
+ printk("%sNILFS: %pV\n", level, &vaf);
+ va_end(args);
+}
+
static void nilfs_set_error(struct super_block *sb)
{
struct the_nilfs *nilfs = sb->s_fs_info;
@@ -95,19 +107,20 @@ static void nilfs_set_error(struct super_block *sb)
}
/**
- * nilfs_error() - report failure condition on a filesystem
+ * __nilfs_error() - report failure condition on a filesystem
+ *
+ * __nilfs_error() sets an ERROR_FS flag on the superblock as well as
+ * reporting an error message. This function should be called when
+ * NILFS detects incoherences or defects of meta data on disk.
*
- * nilfs_error() sets an ERROR_FS flag on the superblock as well as
- * reporting an error message. It should be called when NILFS detects
- * incoherences or defects of meta data on disk. As for sustainable
- * errors such as a single-shot I/O error, nilfs_warning() or the printk()
- * function should be used instead.
+ * This implements the body of nilfs_error() macro. Normally,
+ * nilfs_error() should be used. As for sustainable errors such as a
+ * single-shot I/O error, nilfs_msg() should be used instead.
*
- * The segment constructor must not call this function because it can
- * kill itself.
+ * Callers should not add a trailing newline since this will do it.
*/
-void nilfs_error(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void __nilfs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
struct the_nilfs *nilfs = sb->s_fs_info;
struct va_format vaf;
@@ -137,24 +150,6 @@ void nilfs_error(struct super_block *sb, const char *function,
sb->s_id);
}
-void nilfs_warning(struct super_block *sb, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
- sb->s_id, function, &vaf);
-
- va_end(args);
-}
-
-
struct inode *nilfs_alloc_inode(struct super_block *sb)
{
struct nilfs_inode_info *ii;
@@ -173,12 +168,10 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
static void nilfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
- if (mdi) {
- kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
- kfree(mdi);
- }
+ if (nilfs_is_metadata_file_inode(inode))
+ nilfs_mdt_destroy(inode);
+
kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
}
@@ -202,8 +195,8 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
}
if (unlikely(err)) {
- printk(KERN_ERR
- "NILFS: unable to write superblock (err=%d)\n", err);
+ nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d",
+ err);
if (err == -EIO && nilfs->ns_sbh[1]) {
/*
* sbp[0] points to newer log than sbp[1],
@@ -273,13 +266,12 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
} else {
- printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
- sb->s_id);
+ nilfs_msg(sb, KERN_CRIT, "superblock broke");
return NULL;
}
} else if (sbp[1] &&
sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
- memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+ memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
}
if (flip && sbp[1])
@@ -384,9 +376,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
offset = sb2off & (nilfs->ns_blocksize - 1);
nsbh = sb_getblk(sb, newblocknr);
if (!nsbh) {
- printk(KERN_WARNING
- "NILFS warning: unable to move secondary superblock "
- "to block %llu\n", (unsigned long long)newblocknr);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to move secondary superblock to block %llu",
+ (unsigned long long)newblocknr);
ret = -EIO;
goto out;
}
@@ -549,10 +541,9 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
up_read(&nilfs->ns_segctor_sem);
if (unlikely(err)) {
if (err == -ENOENT || err == -EINVAL) {
- printk(KERN_ERR
- "NILFS: Invalid checkpoint "
- "(checkpoint number=%llu)\n",
- (unsigned long long)cno);
+ nilfs_msg(sb, KERN_ERR,
+ "Invalid checkpoint (checkpoint number=%llu)",
+ (unsigned long long)cno);
err = -EINVAL;
}
goto failed;
@@ -648,9 +639,8 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
err = nilfs_ifile_count_free_inodes(root->ifile,
&nmaxinodes, &nfreeinodes);
if (unlikely(err)) {
- printk(KERN_WARNING
- "NILFS warning: fail to count free inodes: err %d.\n",
- err);
+ nilfs_msg(sb, KERN_WARNING,
+ "failed to count free inodes: err=%d", err);
if (err == -ERANGE) {
/*
* If nilfs_palloc_count_max_entries() returns
@@ -749,6 +739,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -781,9 +772,9 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
break;
case Opt_snapshot:
if (is_remount) {
- printk(KERN_ERR
- "NILFS: \"%s\" option is invalid "
- "for remount.\n", p);
+ nilfs_msg(sb, KERN_ERR,
+ "\"%s\" option is invalid for remount",
+ p);
return 0;
}
break;
@@ -797,8 +788,8 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
nilfs_clear_opt(nilfs, DISCARD);
break;
default:
- printk(KERN_ERR
- "NILFS: Unrecognized mount option \"%s\"\n", p);
+ nilfs_msg(sb, KERN_ERR,
+ "unrecognized mount option \"%s\"", p);
return 0;
}
}
@@ -834,12 +825,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
- printk(KERN_WARNING
- "NILFS warning: mounting fs with errors\n");
+ nilfs_msg(sb, KERN_WARNING, "mounting fs with errors");
#if 0
} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
- printk(KERN_WARNING
- "NILFS warning: maximal mount count reached\n");
+ nilfs_msg(sb, KERN_WARNING, "maximal mount count reached");
#endif
}
if (!max_mnt_count)
@@ -891,7 +880,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
- return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+ return !parse_options(data, sb, 0) ? -EINVAL : 0;
}
int nilfs_check_feature_compatibility(struct super_block *sb,
@@ -902,17 +891,17 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_incompat) &
~NILFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
- "optional features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount because of unsupported optional features (%llx)",
+ (unsigned long long)features);
return -EINVAL;
}
features = le64_to_cpu(sbp->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
- "unsupported optional features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
return -EINVAL;
}
return 0;
@@ -928,13 +917,13 @@ static int nilfs_get_root_dentry(struct super_block *sb,
inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
if (IS_ERR(inode)) {
- printk(KERN_ERR "NILFS: get root inode failed\n");
ret = PTR_ERR(inode);
+ nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret);
goto out;
}
if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
iput(inode);
- printk(KERN_ERR "NILFS: corrupt root inode.\n");
+ nilfs_msg(sb, KERN_ERR, "corrupt root inode");
ret = -EINVAL;
goto out;
}
@@ -962,7 +951,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
return ret;
failed_dentry:
- printk(KERN_ERR "NILFS: get root dentry failed\n");
+ nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret);
goto out;
}
@@ -982,18 +971,18 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = (ret == -ENOENT) ? -EINVAL : ret;
goto out;
} else if (!ret) {
- printk(KERN_ERR "NILFS: The specified checkpoint is "
- "not a snapshot (checkpoint number=%llu).\n",
- (unsigned long long)cno);
+ nilfs_msg(s, KERN_ERR,
+ "The specified checkpoint is not a snapshot (checkpoint number=%llu)",
+ (unsigned long long)cno);
ret = -EINVAL;
goto out;
}
ret = nilfs_attach_checkpoint(s, cno, false, &root);
if (ret) {
- printk(KERN_ERR "NILFS: error loading snapshot "
- "(checkpoint number=%llu).\n",
- (unsigned long long)cno);
+ nilfs_msg(s, KERN_ERR,
+ "error %d while loading snapshot (checkpoint number=%llu)",
+ ret, (unsigned long long)cno);
goto out;
}
ret = nilfs_get_root_dentry(s, root, root_dentry);
@@ -1063,7 +1052,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
__u64 cno;
int err;
- nilfs = alloc_nilfs(sb->s_bdev);
+ nilfs = alloc_nilfs(sb);
if (!nilfs)
return -ENOMEM;
@@ -1088,8 +1077,9 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
cno = nilfs_last_cno(nilfs);
err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
if (err) {
- printk(KERN_ERR "NILFS: error loading last checkpoint "
- "(checkpoint number=%llu).\n", (unsigned long long)cno);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d while loading last checkpoint (checkpoint number=%llu)",
+ err, (unsigned long long)cno);
goto failed_unload;
}
@@ -1149,9 +1139,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount because the filesystem is in an "
- "incomplete recovery state.\n", sb->s_id);
+ nilfs_msg(sb, KERN_WARNING,
+ "couldn't remount because the filesystem is in an incomplete recovery state");
goto restore_opts;
}
@@ -1183,10 +1172,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
~NILFS_FEATURE_COMPAT_RO_SUPP;
up_read(&nilfs->ns_sem);
if (features) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount RDWR because of unsupported optional "
- "features (%llx)\n",
- sb->s_id, (unsigned long long)features);
+ nilfs_msg(sb, KERN_WARNING,
+ "couldn't remount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto restore_opts;
}
@@ -1217,6 +1205,38 @@ struct nilfs_super_data {
int flags;
};
+static int nilfs_parse_snapshot_option(const char *option,
+ const substring_t *arg,
+ struct nilfs_super_data *sd)
+{
+ unsigned long long val;
+ const char *msg = NULL;
+ int err;
+
+ if (!(sd->flags & MS_RDONLY)) {
+ msg = "read-only option is not specified";
+ goto parse_error;
+ }
+
+ err = kstrtoull(arg->from, 0, &val);
+ if (err) {
+ if (err == -ERANGE)
+ msg = "too large checkpoint number";
+ else
+ msg = "malformed argument";
+ goto parse_error;
+ } else if (val == 0) {
+ msg = "invalid checkpoint number 0";
+ goto parse_error;
+ }
+ sd->cno = val;
+ return 0;
+
+parse_error:
+ nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg);
+ return 1;
+}
+
/**
* nilfs_identify - pre-read mount options needed to identify mount instance
* @data: mount options
@@ -1233,24 +1253,9 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
p = strsep(&options, ",");
if (p != NULL && *p) {
token = match_token(p, tokens, args);
- if (token == Opt_snapshot) {
- if (!(sd->flags & MS_RDONLY)) {
- ret++;
- } else {
- sd->cno = simple_strtoull(args[0].from,
- NULL, 0);
- /*
- * No need to see the end pointer;
- * match_token() has done syntax
- * checking.
- */
- if (sd->cno == 0)
- ret++;
- }
- }
- if (ret)
- printk(KERN_ERR
- "NILFS: invalid mount option: %s\n", p);
+ if (token == Opt_snapshot)
+ ret = nilfs_parse_snapshot_option(p, &args[0],
+ sd);
}
if (!options)
break;
@@ -1316,7 +1321,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (!s->s_root) {
- s_new = true;
+ s_new = true;
/* New superblock instance created */
s->s_mode = mode;
@@ -1331,10 +1336,10 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
} else if (!sd.cno) {
if (nilfs_tree_is_busy(s->s_root)) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
- printk(KERN_ERR "NILFS: the device already "
- "has a %s mount.\n",
- (s->s_flags & MS_RDONLY) ?
- "read-only" : "read/write");
+ nilfs_msg(s, KERN_ERR,
+ "the device already has a %s mount.",
+ (s->s_flags & MS_RDONLY) ?
+ "read-only" : "read/write");
err = -EBUSY;
goto failed_super;
}
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index bbb0dcc35905c..490303e3d5179 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -68,7 +68,7 @@ static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
static const struct sysfs_ops nilfs_##name##_attr_ops = { \
.show = nilfs_##name##_attr_show, \
.store = nilfs_##name##_attr_store, \
-};
+}
#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
static void nilfs_##name##_attr_release(struct kobject *kobj) \
@@ -84,7 +84,7 @@ static struct kobj_type nilfs_##name##_ktype = { \
.default_attrs = nilfs_##name##_attrs, \
.sysfs_ops = &nilfs_##name##_attr_ops, \
.release = nilfs_##name##_attr_release, \
-};
+}
#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \
static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
@@ -272,8 +272,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get checkpoint stat: err=%d", err);
return err;
}
@@ -295,8 +295,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get checkpoint stat: err=%d", err);
return err;
}
@@ -326,9 +326,9 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
{
__u64 cno;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
cno = nilfs->ns_cno;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
}
@@ -414,8 +414,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get segment stat: err=%d", err);
return err;
}
@@ -511,9 +511,9 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
{
u64 seg_seq;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
seg_seq = nilfs->ns_seg_seq;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
}
@@ -525,9 +525,9 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
{
__u64 segnum;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
segnum = nilfs->ns_segnum;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
}
@@ -539,9 +539,9 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
{
__u64 nextnum;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nextnum = nilfs->ns_nextnum;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
}
@@ -553,9 +553,9 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
{
unsigned long pseg_offset;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
pseg_offset = nilfs->ns_pseg_offset;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
}
@@ -567,9 +567,9 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
{
__u64 cno;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
cno = nilfs->ns_cno;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
}
@@ -581,9 +581,9 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
{
time_t ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ctime = nilfs->ns_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return NILFS_SHOW_TIME(ctime, buf);
}
@@ -595,9 +595,9 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
{
time_t ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ctime = nilfs->ns_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
}
@@ -609,9 +609,9 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
{
time_t nongc_ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nongc_ctime = nilfs->ns_nongc_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return NILFS_SHOW_TIME(nongc_ctime, buf);
}
@@ -623,9 +623,9 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
{
time_t nongc_ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nongc_ctime = nilfs->ns_nongc_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)nongc_ctime);
@@ -638,9 +638,9 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
{
u32 ndirtyblks;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
}
@@ -756,7 +756,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- unsigned sbwcount;
+ unsigned int sbwcount;
down_read(&nilfs->ns_sem);
sbwcount = nilfs->ns_sbwcount;
@@ -770,7 +770,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- unsigned sb_update_freq;
+ unsigned int sb_update_freq;
down_read(&nilfs->ns_sem);
sb_update_freq = nilfs->ns_sb_update_freq;
@@ -784,19 +784,20 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
const char *buf, size_t count)
{
- unsigned val;
+ unsigned int val;
int err;
err = kstrtouint(skip_spaces(buf), 0, &val);
if (err) {
- printk(KERN_ERR "NILFS: unable to convert string: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to convert string: err=%d", err);
return err;
}
if (val < NILFS_SB_FREQ) {
val = NILFS_SB_FREQ;
- printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n");
+ nilfs_msg(nilfs->ns_sb, KERN_WARNING,
+ "superblock update frequency cannot be lesser than 10 seconds");
}
down_write(&nilfs->ns_sem);
@@ -999,7 +1000,8 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
if (unlikely(!nilfs->ns_dev_subgroups)) {
err = -ENOMEM;
- printk(KERN_ERR "NILFS: unable to allocate memory for device group\n");
+ nilfs_msg(sb, KERN_ERR,
+ "unable to allocate memory for device group");
goto failed_create_device_group;
}
@@ -1109,15 +1111,15 @@ int __init nilfs_sysfs_init(void)
nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
if (!nilfs_kset) {
err = -ENOMEM;
- printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n",
- err);
+ nilfs_msg(NULL, KERN_ERR,
+ "unable to create sysfs entry: err=%d", err);
goto failed_sysfs_init;
}
err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: unable to create feature group: err %d\n",
- err);
+ nilfs_msg(NULL, KERN_ERR,
+ "unable to create feature group: err=%d", err);
goto cleanup_sysfs_init;
}
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 677e3a1a83708..648cedf9c06ec 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -66,7 +66,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct kobject *, struct attribute *, \
const char *, size_t); \
-};
+}
NILFS_COMMON_ATTR_STRUCT(feature);
@@ -77,7 +77,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \
const char *, size_t); \
-};
+}
NILFS_DEV_ATTR_STRUCT(dev);
NILFS_DEV_ATTR_STRUCT(segments);
@@ -93,7 +93,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \
const char *, size_t); \
-};
+}
NILFS_CP_ATTR_STRUCT(snapshot);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 69bd801afb53b..2dd75bf619ad0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -60,12 +56,12 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
/**
* alloc_nilfs - allocate a nilfs object
- * @bdev: block device to which the_nilfs is related
+ * @sb: super block instance
*
* Return Value: On success, pointer to the_nilfs is returned.
* On error, NULL is returned.
*/
-struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+struct the_nilfs *alloc_nilfs(struct super_block *sb)
{
struct the_nilfs *nilfs;
@@ -73,7 +69,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
if (!nilfs)
return NULL;
- nilfs->ns_bdev = bdev;
+ nilfs->ns_sb = sb;
+ nilfs->ns_bdev = sb->s_bdev;
atomic_set(&nilfs->ns_ndirtyblks, 0);
init_rwsem(&nilfs->ns_sem);
mutex_init(&nilfs->ns_snapshot_mount_mutex);
@@ -112,8 +109,8 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
struct nilfs_super_root *raw_sr;
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct nilfs_inode *rawi;
- unsigned dat_entry_size, segment_usage_size, checkpoint_size;
- unsigned inode_size;
+ unsigned int dat_entry_size, segment_usage_size, checkpoint_size;
+ unsigned int inode_size;
int err;
err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
@@ -195,7 +192,10 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
nilfs->ns_cno = nilfs->ns_last_cno + 1;
if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
- printk(KERN_ERR "NILFS invalid last segment number.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "pointed segment number is out of range: segnum=%llu, nsegments=%lu",
+ (unsigned long long)nilfs->ns_segnum,
+ nilfs->ns_nsegments);
ret = -EINVAL;
}
return ret;
@@ -219,12 +219,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
int err;
if (!valid_fs) {
- printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+ nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
if (s_flags & MS_RDONLY) {
- printk(KERN_INFO "NILFS: INFO: recovery "
- "required for readonly filesystem.\n");
- printk(KERN_INFO "NILFS: write access will "
- "be enabled during recovery.\n");
+ nilfs_msg(sb, KERN_INFO,
+ "recovery required for readonly filesystem");
+ nilfs_msg(sb, KERN_INFO,
+ "write access will be enabled during recovery");
}
}
@@ -239,13 +239,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto scan_error;
if (!nilfs_valid_sb(sbp[1])) {
- printk(KERN_WARNING
- "NILFS warning: unable to fall back to spare"
- "super block\n");
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to fall back to spare super block");
goto scan_error;
}
- printk(KERN_INFO
- "NILFS: try rollback from an earlier position\n");
+ nilfs_msg(sb, KERN_INFO,
+ "trying rollback from an earlier position");
/*
* restore super block with its spare and reconfigure
@@ -258,10 +257,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
/* verify consistency between two super blocks */
blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
if (blocksize != nilfs->ns_blocksize) {
- printk(KERN_WARNING
- "NILFS warning: blocksize differs between "
- "two super blocks (%d != %d)\n",
- blocksize, nilfs->ns_blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "blocksize differs between two super blocks (%d != %d)",
+ blocksize, nilfs->ns_blocksize);
goto scan_error;
}
@@ -280,7 +278,8 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: error loading super root.\n");
+ nilfs_msg(sb, KERN_ERR, "error %d while loading super root",
+ err);
goto failed;
}
@@ -291,30 +290,29 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
__u64 features;
if (nilfs_test_opt(nilfs, NORECOVERY)) {
- printk(KERN_INFO "NILFS: norecovery option specified. "
- "skipping roll-forward recovery\n");
+ nilfs_msg(sb, KERN_INFO,
+ "norecovery option specified, skipping roll-forward recovery");
goto skip_recovery;
}
features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (features) {
- printk(KERN_ERR "NILFS: couldn't proceed with "
- "recovery because of unsupported optional "
- "features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't proceed with recovery because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto failed_unload;
}
if (really_read_only) {
- printk(KERN_ERR "NILFS: write access "
- "unavailable, cannot proceed.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "write access unavailable, cannot proceed");
err = -EROFS;
goto failed_unload;
}
sb->s_flags &= ~MS_RDONLY;
} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
- printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
- "option was specified for a read/write mount\n");
+ nilfs_msg(sb, KERN_ERR,
+ "recovery cancelled because norecovery option was specified for a read/write mount");
err = -EINVAL;
goto failed_unload;
}
@@ -329,11 +327,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
up_write(&nilfs->ns_sem);
if (err) {
- printk(KERN_ERR "NILFS: failed to update super block. "
- "recovery unfinished.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d updating super block. recovery unfinished.",
+ err);
goto failed_unload;
}
- printk(KERN_INFO "NILFS: recovery complete.\n");
+ nilfs_msg(sb, KERN_INFO, "recovery complete");
skip_recovery:
nilfs_clear_recovery_info(&ri);
@@ -341,7 +340,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
return 0;
scan_error:
- printk(KERN_ERR "NILFS: error searching super root.\n");
+ nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err);
goto failed;
failed_unload:
@@ -388,12 +387,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
- printk(KERN_ERR "NILFS: unsupported revision "
- "(superblock rev.=%d.%d, current rev.=%d.%d). "
- "Please check the version of mkfs.nilfs.\n",
- le32_to_cpu(sbp->s_rev_level),
- le16_to_cpu(sbp->s_minor_rev_level),
- NILFS_CURRENT_REV, NILFS_MINOR_REV);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
+ le32_to_cpu(sbp->s_rev_level),
+ le16_to_cpu(sbp->s_minor_rev_level),
+ NILFS_CURRENT_REV, NILFS_MINOR_REV);
return -EINVAL;
}
nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
@@ -402,12 +400,14 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
- printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
- nilfs->ns_inode_size);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too large inode size: %d bytes",
+ nilfs->ns_inode_size);
return -EINVAL;
} else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
- printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
- nilfs->ns_inode_size);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too small inode size: %d bytes",
+ nilfs->ns_inode_size);
return -EINVAL;
}
@@ -415,7 +415,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
- printk(KERN_ERR "NILFS: too short segment.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too short segment: %lu blocks",
+ nilfs->ns_blocks_per_segment);
return -EINVAL;
}
@@ -424,7 +426,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
le32_to_cpu(sbp->s_r_segments_percentage);
if (nilfs->ns_r_segments_percentage < 1 ||
nilfs->ns_r_segments_percentage > 99) {
- printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "invalid reserved segments percentage: %lu",
+ nilfs->ns_r_segments_percentage);
return -EINVAL;
}
@@ -443,7 +447,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp)
if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
return 0;
bytes = le16_to_cpu(sbp->s_bytes);
- if (bytes > BLOCK_SIZE)
+ if (bytes < sumoff + 4 || bytes > BLOCK_SIZE)
return 0;
crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
sumoff);
@@ -508,16 +512,16 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
if (!sbp[0]) {
if (!sbp[1]) {
- printk(KERN_ERR "NILFS: unable to read superblock\n");
+ nilfs_msg(sb, KERN_ERR, "unable to read superblock");
return -EIO;
}
- printk(KERN_WARNING
- "NILFS warning: unable to read primary superblock "
- "(blocksize = %d)\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to read primary superblock (blocksize = %d)",
+ blocksize);
} else if (!sbp[1]) {
- printk(KERN_WARNING
- "NILFS warning: unable to read secondary superblock "
- "(blocksize = %d)\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to read secondary superblock (blocksize = %d)",
+ blocksize);
}
/*
@@ -539,14 +543,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
}
if (!valid[swp]) {
nilfs_release_super_block(nilfs);
- printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
- sb->s_id);
+ nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device");
return -EINVAL;
}
if (!valid[!swp])
- printk(KERN_WARNING "NILFS warning: broken superblock. "
- "using spare superblock (blocksize = %d).\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "broken superblock, retrying with spare superblock (blocksize = %d)",
+ blocksize);
if (swp)
nilfs_swap_super_block(nilfs);
@@ -580,7 +584,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
if (!blocksize) {
- printk(KERN_ERR "NILFS: unable to set blocksize\n");
+ nilfs_msg(sb, KERN_ERR, "unable to set blocksize");
err = -EINVAL;
goto out;
}
@@ -599,8 +603,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
if (blocksize < NILFS_MIN_BLOCK_SIZE ||
blocksize > NILFS_MAX_BLOCK_SIZE) {
- printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
- "filesystem blocksize %d\n", blocksize);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount because of unsupported filesystem blocksize %d",
+ blocksize);
err = -EINVAL;
goto failed_sbh;
}
@@ -608,10 +613,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
if (blocksize < hw_blocksize) {
- printk(KERN_ERR
- "NILFS: blocksize %d too small for device "
- "(sector-size = %d).\n",
- blocksize, hw_blocksize);
+ nilfs_msg(sb, KERN_ERR,
+ "blocksize %d too small for device (sector-size = %d)",
+ blocksize, hw_blocksize);
err = -EINVAL;
goto failed_sbh;
}
@@ -621,8 +625,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
if (err)
goto out;
- /* not failed_sbh; sbh is released automatically
- when reloading fails. */
+ /*
+ * Not to failed_sbh; sbh is released automatically
+ * when reloading fails.
+ */
}
nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
nilfs->ns_blocksize = blocksize;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 23778d385836f..b305c6f033e7c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -47,6 +43,7 @@ enum {
* struct the_nilfs - struct to supervise multiple nilfs mount points
* @ns_flags: flags
* @ns_flushed_device: flag indicating if all volatile data was flushed
+ * @ns_sb: back pointer to super block instance
* @ns_bdev: block device
* @ns_sem: semaphore for shared states
* @ns_snapshot_mount_mutex: mutex to protect snapshot mounts
@@ -106,6 +103,7 @@ struct the_nilfs {
unsigned long ns_flags;
int ns_flushed_device;
+ struct super_block *ns_sb;
struct block_device *ns_bdev;
struct rw_semaphore ns_sem;
struct mutex ns_snapshot_mount_mutex;
@@ -118,17 +116,14 @@ struct the_nilfs {
struct buffer_head *ns_sbh[2];
struct nilfs_super_block *ns_sbp[2];
time_t ns_sbwtime;
- unsigned ns_sbwcount;
- unsigned ns_sbsize;
- unsigned ns_mount_state;
- unsigned ns_sb_update_freq;
+ unsigned int ns_sbwcount;
+ unsigned int ns_sbsize;
+ unsigned int ns_mount_state;
+ unsigned int ns_sb_update_freq;
/*
- * Following fields are dedicated to a writable FS-instance.
- * Except for the period seeking checkpoint, code outside the segment
- * constructor must lock a segment semaphore while accessing these
- * fields.
- * The writable FS-instance is sole during a lifetime of the_nilfs.
+ * The following fields are updated by a writable FS-instance.
+ * These fields are protected by ns_segctor_sem outside load_nilfs().
*/
u64 ns_seg_seq;
__u64 ns_segnum;
@@ -226,15 +221,14 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
* Mount option operations
*/
#define nilfs_clear_opt(nilfs, opt) \
- do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
+ ((nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt)
#define nilfs_set_opt(nilfs, opt) \
- do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
+ ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt)
#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
#define nilfs_write_opt(nilfs, mask, opt) \
- do { (nilfs)->ns_mount_opt = \
+ ((nilfs)->ns_mount_opt = \
(((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
- NILFS_MOUNT_##opt); \
- } while (0)
+ NILFS_MOUNT_##opt)) \
/**
* struct nilfs_root - nilfs root object
@@ -273,6 +267,7 @@ struct nilfs_root {
static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
{
u64 t = get_seconds();
+
return t < nilfs->ns_sbwtime ||
t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
}
@@ -280,11 +275,12 @@ static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
{
int flip_bits = nilfs->ns_sbwcount & 0x0FL;
+
return (flip_bits != 0x08 && flip_bits != 0x0F);
}
void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *alloc_nilfs(struct block_device *bdev);
+struct the_nilfs *alloc_nilfs(struct super_block *sb);
void destroy_nilfs(struct the_nilfs *nilfs);
int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
@@ -308,7 +304,7 @@ static inline void nilfs_get_root(struct nilfs_root *root)
static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
{
- unsigned valid_fs;
+ unsigned int valid_fs;
down_read(&nilfs->ns_sem);
valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b44c68a857e77..0a3bc2cf192cf 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
&mnt->mnt_root->d_lock);
}
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/*
+ * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
+ */
+extern void fsnotify_mark_destroy_list(void);
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62cb28544..3e2dd85be5dd3 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
*/
void fsnotify_destroy_group(struct fsnotify_group *group)
{
- /* clear all inode marks for this group */
- fsnotify_clear_marks_by_group(group);
+ /* clear all inode marks for this group, attach them to destroy_list */
+ fsnotify_detach_group_marks(group);
- synchronize_srcu(&fsnotify_mark_srcu);
+ /*
+ * Wait for fsnotify_mark_srcu period to end and free all marks in
+ * destroy_list
+ */
+ fsnotify_mark_destroy_list();
- /* clear the notification queue of all events */
+ /*
+ * Since we have waited for fsnotify_mark_srcu in
+ * fsnotify_mark_destroy_list() there can be no outstanding event
+ * notification against this group. So clearing the notification queue
+ * of all events is reliable now.
+ */
fsnotify_flush_notify(group);
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d7d373c..d3fea0bd89e2c 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
}
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
*/
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
@@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
- return;
+ return false;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
-
/*
* Some groups like to know that marks are being freed. This is a
* callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
+
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+
+ return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ if (__fsnotify_free_mark(mark)) {
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+ }
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
/*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
*/
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
{
- fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+ struct fsnotify_mark *mark;
+
+ while (1) {
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ if (list_empty(&group->marks_list)) {
+ mutex_unlock(&group->mark_mutex);
+ break;
+ }
+ mark = list_first_entry(&group->marks_list,
+ struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&group->mark_mutex);
+ __fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ }
}
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
mark->free_mark = free_mark;
}
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list, waits for SRCU period to finish before
+ * actually freeing marks.
+ */
+void fsnotify_mark_destroy_list(void)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
@@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
fsnotify_put_mark(mark);
}
}
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+ fsnotify_mark_destroy_list();
+}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7521e11db728f..fe251f187ff8f 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -74,7 +74,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
- file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
+ file_ofs = ((s64)page->index << PAGE_SHIFT) +
bh_offset(bh);
read_lock_irqsave(&ni->size_lock, flags);
init_size = ni->initialized_size;
@@ -142,7 +142,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
u32 rec_size;
rec_size = ni->itype.index.block_size;
- recs = PAGE_CACHE_SIZE / rec_size;
+ recs = PAGE_SIZE / rec_size;
/* Should have been verified before we got here... */
BUG_ON(!recs);
local_irq_save(flags);
@@ -229,7 +229,7 @@ static int ntfs_read_block(struct page *page)
* fully truncated, truncate will throw it away as soon as we unlock
* it so no need to worry what we do with it.
*/
- iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+ iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
init_size = ni->initialized_size;
@@ -362,7 +362,7 @@ handle_zblock:
for (i = 0; i < nr; i++) {
tbh = arr[i];
if (likely(!buffer_uptodate(tbh)))
- submit_bh(READ, tbh);
+ submit_bh(REQ_OP_READ, 0, tbh);
else
ntfs_end_buffer_async_read(tbh, 1);
}
@@ -412,9 +412,9 @@ retry_readpage:
vi = page->mapping->host;
i_size = i_size_read(vi);
/* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT)) {
+ zero_user(page, 0, PAGE_SIZE);
ntfs_debug("Read outside i_size - truncated?");
goto done;
}
@@ -463,7 +463,7 @@ retry_readpage:
* ok to ignore the compressed flag here.
*/
if (unlikely(page->index > 0)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
goto done;
}
if (!NInoAttr(ni))
@@ -509,7 +509,7 @@ retry_readpage:
le16_to_cpu(ctx->attr->data.resident.value_offset),
attr_len);
/* Zero the remainder of the page. */
- memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+ memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
flush_dcache_page(page);
kunmap_atomic(addr);
put_unm_err_out:
@@ -599,7 +599,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
/* NOTE: Different naming scheme to ntfs_read_block()! */
/* The first block in the page. */
- block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+ block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
i_size = i_size_read(vi);
@@ -674,7 +674,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
// in the inode.
// Again, for each page do:
// __set_page_dirty_buffers();
- // page_cache_release()
+ // put_page()
// We don't need to wait on the writes.
// Update iblock.
}
@@ -877,7 +877,7 @@ lock_retry_remap:
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
need_end_writeback = false;
}
bh = next;
@@ -925,7 +925,7 @@ static int ntfs_write_mst_block(struct page *page,
ntfs_volume *vol = ni->vol;
u8 *kaddr;
unsigned int rec_size = ni->itype.index.block_size;
- ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
+ ntfs_inode *locked_nis[PAGE_SIZE / rec_size];
struct buffer_head *bh, *head, *tbh, *rec_start_bh;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
runlist_element *rl;
@@ -949,7 +949,7 @@ static int ntfs_write_mst_block(struct page *page,
(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
bh_size = vol->sb->s_blocksize;
bh_size_bits = vol->sb->s_blocksize_bits;
- max_bhs = PAGE_CACHE_SIZE / bh_size;
+ max_bhs = PAGE_SIZE / bh_size;
BUG_ON(!max_bhs);
BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
@@ -961,13 +961,13 @@ static int ntfs_write_mst_block(struct page *page,
BUG_ON(!bh);
rec_size_bits = ni->itype.index.block_size_bits;
- BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
+ BUG_ON(!(PAGE_SIZE >> rec_size_bits));
bhs_per_rec = rec_size >> bh_size_bits;
BUG_ON(!bhs_per_rec);
/* The first block in the page. */
rec_block = block = (sector_t)page->index <<
- (PAGE_CACHE_SHIFT - bh_size_bits);
+ (PAGE_SHIFT - bh_size_bits);
/* The first out of bounds block for the data size. */
dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
@@ -1133,7 +1133,7 @@ lock_retry_remap:
unsigned long mft_no;
/* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+ mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
>> rec_size_bits;
/* Check whether to write this mft record. */
tni = NULL;
@@ -1202,7 +1202,7 @@ lock_retry_remap:
BUG_ON(!buffer_mapped(tbh));
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE, tbh);
+ submit_bh(REQ_OP_WRITE, 0, tbh);
}
/* Synchronize the mft mirror now if not @sync. */
if (is_mft && !sync)
@@ -1249,7 +1249,7 @@ do_mirror:
continue;
ofs = bh_offset(tbh);
/* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+ mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
>> rec_size_bits;
if (mft_no < vol->mftmirr_size)
ntfs_sync_mft_mirror(vol, mft_no,
@@ -1300,7 +1300,7 @@ done:
* Set page error if there is only one ntfs record in the page.
* Otherwise we would loose per-record granularity.
*/
- if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
+ if (ni->itype.index.block_size == PAGE_SIZE)
SetPageError(page);
NVolSetErrors(vol);
}
@@ -1308,7 +1308,7 @@ done:
ntfs_debug("Page still contains one or more dirty ntfs "
"records. Redirtying the page starting at "
"record 0x%lx.", page->index <<
- (PAGE_CACHE_SHIFT - rec_size_bits));
+ (PAGE_SHIFT - rec_size_bits));
redirty_page_for_writepage(wbc, page);
unlock_page(page);
} else {
@@ -1365,13 +1365,13 @@ retry_writepage:
BUG_ON(!PageLocked(page));
i_size = i_size_read(vi);
/* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT)) {
+ if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT)) {
/*
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ block_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
ntfs_debug("Write outside i_size - truncated?");
return 0;
@@ -1414,10 +1414,10 @@ retry_writepage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* We have to zero every time due to mmap-at-end-of-file. */
- if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
+ if (page->index >= (i_size >> PAGE_SHIFT)) {
/* The page straddles i_size. */
- unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
- zero_user_segment(page, ofs, PAGE_CACHE_SIZE);
+ unsigned int ofs = i_size & ~PAGE_MASK;
+ zero_user_segment(page, ofs, PAGE_SIZE);
}
/* Handle mst protected attributes. */
if (NInoMstProtected(ni))
@@ -1500,7 +1500,7 @@ retry_writepage:
le16_to_cpu(ctx->attr->data.resident.value_offset),
addr, attr_len);
/* Zero out of bounds area in the page cache page. */
- memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+ memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
kunmap_atomic(addr);
flush_dcache_page(page);
flush_dcache_mft_record_page(ctx->ntfs_ino);
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index caecc58f529c9..820d6eabf60f0 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -40,7 +40,7 @@
static inline void ntfs_unmap_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/**
@@ -49,7 +49,7 @@ static inline void ntfs_unmap_page(struct page *page)
* @index: index into the page cache for @mapping of the page to map
*
* Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes.
+ * @index, where @index is in units of PAGE_SIZE, and not in bytes.
*
* If the page is not in memory it is loaded from disk first using the readpage
* method defined in the address space operations of @mapping and the page is
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 250ed5b20c8fb..44a39a099b54e 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -152,7 +152,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
old_ctx.base_ntfs_ino) {
put_this_page = old_ctx.ntfs_ino->page;
- page_cache_get(put_this_page);
+ get_page(put_this_page);
}
/*
* Reinitialize the search context so we can lookup the
@@ -275,7 +275,7 @@ retry_map:
* the pieces anyway.
*/
if (put_this_page)
- page_cache_release(put_this_page);
+ put_page(put_this_page);
}
return err;
}
@@ -1660,7 +1660,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
memcpy(kaddr, (u8*)a +
le16_to_cpu(a->data.resident.value_offset),
attr_size);
- memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size);
+ memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
kunmap_atomic(kaddr);
flush_dcache_page(page);
SetPageUptodate(page);
@@ -1748,7 +1748,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
if (page) {
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
ntfs_debug("Done.");
return 0;
@@ -1835,7 +1835,7 @@ rl_err_out:
ntfs_free(rl);
page_err_out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (err == -EINVAL)
err = -EIO;
@@ -2513,17 +2513,17 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
BUG_ON(NInoEncrypted(ni));
mapping = VFS_I(ni)->i_mapping;
/* Work out the starting index and page offset. */
- idx = ofs >> PAGE_CACHE_SHIFT;
- start_ofs = ofs & ~PAGE_CACHE_MASK;
+ idx = ofs >> PAGE_SHIFT;
+ start_ofs = ofs & ~PAGE_MASK;
/* Work out the ending index and page offset. */
end = ofs + cnt;
- end_ofs = end & ~PAGE_CACHE_MASK;
+ end_ofs = end & ~PAGE_MASK;
/* If the end is outside the inode size return -ESPIPE. */
if (unlikely(end > i_size_read(VFS_I(ni)))) {
ntfs_error(vol->sb, "Request exceeds end of attribute.");
return -ESPIPE;
}
- end >>= PAGE_CACHE_SHIFT;
+ end >>= PAGE_SHIFT;
/* If there is a first partial page, need to do it the slow way. */
if (start_ofs) {
page = read_mapping_page(mapping, idx, NULL);
@@ -2536,7 +2536,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
* If the last page is the same as the first page, need to
* limit the write to the end offset.
*/
- size = PAGE_CACHE_SIZE;
+ size = PAGE_SIZE;
if (idx == end)
size = end_ofs;
kaddr = kmap_atomic(page);
@@ -2544,7 +2544,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
flush_dcache_page(page);
kunmap_atomic(kaddr);
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
balance_dirty_pages_ratelimited(mapping);
cond_resched();
if (idx == end)
@@ -2561,7 +2561,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
return -ENOMEM;
}
kaddr = kmap_atomic(page);
- memset(kaddr, val, PAGE_CACHE_SIZE);
+ memset(kaddr, val, PAGE_SIZE);
flush_dcache_page(page);
kunmap_atomic(kaddr);
/*
@@ -2585,7 +2585,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
set_page_dirty(page);
/* Finally unlock and release the page. */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
balance_dirty_pages_ratelimited(mapping);
cond_resched();
}
@@ -2602,7 +2602,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
flush_dcache_page(page);
kunmap_atomic(kaddr);
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
balance_dirty_pages_ratelimited(mapping);
cond_resched();
}
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index 0809cf8760989..ec130c588d2b6 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -67,8 +67,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
* Calculate the indices for the pages containing the first and last
* bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
*/
- index = start_bit >> (3 + PAGE_CACHE_SHIFT);
- end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT);
+ index = start_bit >> (3 + PAGE_SHIFT);
+ end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
/* Get the page containing the first bit (@start_bit). */
mapping = vi->i_mapping;
@@ -82,7 +82,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
kaddr = page_address(page);
/* Set @pos to the position of the byte containing @start_bit. */
- pos = (start_bit >> 3) & ~PAGE_CACHE_MASK;
+ pos = (start_bit >> 3) & ~PAGE_MASK;
/* Calculate the position of @start_bit in the first byte. */
bit = start_bit & 7;
@@ -108,7 +108,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
* Depending on @value, modify all remaining whole bytes in the page up
* to @cnt.
*/
- len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos);
+ len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
memset(kaddr + pos, value ? 0xff : 0, len);
cnt -= len << 3;
@@ -132,7 +132,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
* Depending on @value, modify all remaining whole bytes in the
* page up to @cnt.
*/
- len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE);
+ len = min_t(s64, cnt >> 3, PAGE_SIZE);
memset(kaddr, value ? 0xff : 0, len);
cnt -= len << 3;
}
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index f82498c35e78a..f8eb04387ca43 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -104,16 +104,12 @@ static void zero_partial_compressed_page(struct page *page,
unsigned int kp_ofs;
ntfs_debug("Zeroing page region outside initialized size.");
- if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) {
- /*
- * FIXME: Using clear_page() will become wrong when we get
- * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
- */
+ if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
clear_page(kp);
return;
}
- kp_ofs = initialized_size & ~PAGE_CACHE_MASK;
- memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs);
+ kp_ofs = initialized_size & ~PAGE_MASK;
+ memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
return;
}
@@ -123,7 +119,7 @@ static void zero_partial_compressed_page(struct page *page,
static inline void handle_bounds_compressed_page(struct page *page,
const loff_t i_size, const s64 initialized_size)
{
- if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) &&
+ if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
(initialized_size < i_size))
zero_partial_compressed_page(page, initialized_size);
return;
@@ -160,7 +156,7 @@ static inline void handle_bounds_compressed_page(struct page *page,
* @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
* completed during the decompression of the compression block (@cb_start).
*
- * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up
+ * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
* unpredicatbly! You have been warned!
*
* Note to hackers: This function may not sleep until it has finished accessing
@@ -241,7 +237,7 @@ return_error:
if (di == xpage)
*xpage_done = 1;
else
- page_cache_release(dp);
+ put_page(dp);
dest_pages[di] = NULL;
}
}
@@ -274,7 +270,7 @@ return_error:
cb = cb_sb_end;
/* Advance destination position to next sub-block. */
- *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK;
+ *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
if (!*dest_ofs && (++*dest_index > dest_max_index))
goto return_overflow;
goto do_next_sb;
@@ -301,7 +297,7 @@ return_error:
/* Advance destination position to next sub-block. */
*dest_ofs += NTFS_SB_SIZE;
- if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) {
+ if (!(*dest_ofs &= ~PAGE_MASK)) {
finalize_page:
/*
* First stage: add current page index to array of
@@ -335,7 +331,7 @@ do_next_tag:
*dest_ofs += nr_bytes;
}
/* We have finished the current sub-block. */
- if (!(*dest_ofs &= ~PAGE_CACHE_MASK))
+ if (!(*dest_ofs &= ~PAGE_MASK))
goto finalize_page;
goto do_next_sb;
}
@@ -462,7 +458,7 @@ return_overflow:
* have been written to so that we would lose data if we were to just overwrite
* them with the out-of-date uncompressed data.
*
- * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at
+ * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at
* the end of the file I think. We need to detect this case and zero the out
* of bounds remainder of the page in question and mark it as handled. At the
* moment we would just return -EIO on such a page. This bug will only become
@@ -470,7 +466,7 @@ return_overflow:
* clusters so is probably not going to be seen by anyone. Still this should
* be fixed. (AIA)
*
- * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in
+ * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in
* handling sparse and compressed cbs. (AIA)
*
* FIXME: At the moment we don't do any zeroing out in the case that
@@ -497,14 +493,14 @@ int ntfs_read_compressed_block(struct page *page)
u64 cb_size_mask = cb_size - 1UL;
VCN vcn;
LCN lcn;
- /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */
- VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
+ /* The first wanted vcn (minimum alignment is PAGE_SIZE). */
+ VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
vol->cluster_size_bits;
/*
* The first vcn after the last wanted vcn (minimum alignment is again
- * PAGE_CACHE_SIZE.
+ * PAGE_SIZE.
*/
- VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
+ VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
& ~cb_size_mask) >> vol->cluster_size_bits;
/* Number of compression blocks (cbs) in the wanted vcn range. */
unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
@@ -515,7 +511,7 @@ int ntfs_read_compressed_block(struct page *page)
* guarantees of start_vcn and end_vcn, no need to round up here.
*/
unsigned int nr_pages = (end_vcn - start_vcn) <<
- vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+ vol->cluster_size_bits >> PAGE_SHIFT;
unsigned int xpage, max_page, cur_page, cur_ofs, i;
unsigned int cb_clusters, cb_max_ofs;
int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
@@ -549,7 +545,7 @@ int ntfs_read_compressed_block(struct page *page)
* We have already been given one page, this is the one we must do.
* Once again, the alignment guarantees keep it simple.
*/
- offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+ offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
xpage = index - offset;
pages[xpage] = page;
/*
@@ -560,13 +556,13 @@ int ntfs_read_compressed_block(struct page *page)
i_size = i_size_read(VFS_I(ni));
initialized_size = ni->initialized_size;
read_unlock_irqrestore(&ni->size_lock, flags);
- max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
offset;
/* Is the page fully outside i_size? (truncate in progress) */
if (xpage >= max_page) {
kfree(bhs);
kfree(pages);
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
ntfs_debug("Compressed read outside i_size - truncated?");
SetPageUptodate(page);
unlock_page(page);
@@ -591,7 +587,7 @@ int ntfs_read_compressed_block(struct page *page)
continue;
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
pages[i] = NULL;
}
}
@@ -674,7 +670,7 @@ lock_retry_remap:
}
get_bh(tbh);
tbh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, tbh);
+ submit_bh(REQ_OP_READ, 0, tbh);
}
/* Wait for io completion on all buffer heads. */
@@ -735,9 +731,9 @@ lock_retry_remap:
ntfs_debug("Successfully read the compression block.");
/* The last page and maximum offset within it for the current cb. */
- cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size;
- cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK;
- cb_max_page >>= PAGE_CACHE_SHIFT;
+ cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
+ cb_max_ofs = cb_max_page & ~PAGE_MASK;
+ cb_max_page >>= PAGE_SHIFT;
/* Catch end of file inside a compression block. */
if (cb_max_page > max_page)
@@ -753,16 +749,11 @@ lock_retry_remap:
for (; cur_page < cb_max_page; cur_page++) {
page = pages[cur_page];
if (page) {
- /*
- * FIXME: Using clear_page() will become wrong
- * when we get PAGE_CACHE_SIZE != PAGE_SIZE but
- * for now there is no problem.
- */
if (likely(!cur_ofs))
clear_page(page_address(page));
else
memset(page_address(page) + cur_ofs, 0,
- PAGE_CACHE_SIZE -
+ PAGE_SIZE -
cur_ofs);
flush_dcache_page(page);
kunmap(page);
@@ -771,10 +762,10 @@ lock_retry_remap:
if (cur_page == xpage)
xpage_done = 1;
else
- page_cache_release(page);
+ put_page(page);
pages[cur_page] = NULL;
}
- cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+ cb_pos += PAGE_SIZE - cur_ofs;
cur_ofs = 0;
if (cb_pos >= cb_end)
break;
@@ -807,7 +798,7 @@ lock_retry_remap:
* synchronous io for the majority of pages.
* Or if we choose not to do the read-ahead/-behind stuff, we
* could just return block_read_full_page(pages[xpage]) as long
- * as PAGE_CACHE_SIZE <= cb_size.
+ * as PAGE_SIZE <= cb_size.
*/
if (cb_max_ofs)
cb_max_page--;
@@ -816,8 +807,8 @@ lock_retry_remap:
page = pages[cur_page];
if (page)
memcpy(page_address(page) + cur_ofs, cb_pos,
- PAGE_CACHE_SIZE - cur_ofs);
- cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+ PAGE_SIZE - cur_ofs);
+ cb_pos += PAGE_SIZE - cur_ofs;
cur_ofs = 0;
if (cb_pos >= cb_end)
break;
@@ -850,10 +841,10 @@ lock_retry_remap:
if (cur2_page == xpage)
xpage_done = 1;
else
- page_cache_release(page);
+ put_page(page);
pages[cur2_page] = NULL;
}
- cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2;
+ cb_pos2 += PAGE_SIZE - cur_ofs2;
cur_ofs2 = 0;
if (cb_pos2 >= cb_end)
break;
@@ -884,7 +875,7 @@ lock_retry_remap:
kunmap(page);
unlock_page(page);
if (prev_cur_page != xpage)
- page_cache_release(page);
+ put_page(page);
pages[prev_cur_page] = NULL;
}
}
@@ -914,7 +905,7 @@ lock_retry_remap:
kunmap(page);
unlock_page(page);
if (cur_page != xpage)
- page_cache_release(page);
+ put_page(page);
pages[cur_page] = NULL;
}
}
@@ -961,7 +952,7 @@ err_out:
kunmap(page);
unlock_page(page);
if (i != xpage)
- page_cache_release(page);
+ put_page(page);
}
}
kfree(pages);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index b2eff5816adc3..a186135790012 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -315,11 +315,11 @@ found_it:
descend_into_child_node:
/*
* Convert vcn to index into the index allocation attribute in units
- * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+ * of PAGE_SIZE and map the page cache page, reading it from
* disk if necessary.
*/
page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+ dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(sb, "Failed to map directory index page, error %ld.",
-PTR_ERR(page));
@@ -331,9 +331,9 @@ descend_into_child_node:
fast_descend_into_child_node:
/* Get to the index allocation block. */
ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+ dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
/* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
"inode 0x%lx or driver bug.", dir_ni->mft_no);
goto unm_err_out;
@@ -366,7 +366,7 @@ fast_descend_into_child_node:
goto unm_err_out;
}
index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_CACHE_SIZE) {
+ if (index_end > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
"0x%lx crosses page boundary. Impossible! "
"Cannot access! This is probably a bug in the "
@@ -559,9 +559,9 @@ found_it2:
/* If vcn is in the same page cache page as old_vcn we
* recycle the mapped page. */
if (old_vcn << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT == vcn <<
+ PAGE_SHIFT == vcn <<
vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT)
+ PAGE_SHIFT)
goto fast_descend_into_child_node;
unlock_page(page);
ntfs_unmap_page(page);
@@ -793,11 +793,11 @@ found_it:
descend_into_child_node:
/*
* Convert vcn to index into the index allocation attribute in units
- * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+ * of PAGE_SIZE and map the page cache page, reading it from
* disk if necessary.
*/
page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+ dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(sb, "Failed to map directory index page, error %ld.",
-PTR_ERR(page));
@@ -809,9 +809,9 @@ descend_into_child_node:
fast_descend_into_child_node:
/* Get to the index allocation block. */
ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+ dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
/* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
"inode 0x%lx or driver bug.", dir_ni->mft_no);
goto unm_err_out;
@@ -844,7 +844,7 @@ fast_descend_into_child_node:
goto unm_err_out;
}
index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_CACHE_SIZE) {
+ if (index_end > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
"0x%lx crosses page boundary. Impossible! "
"Cannot access! This is probably a bug in the "
@@ -968,9 +968,9 @@ found_it2:
/* If vcn is in the same page cache page as old_vcn we
* recycle the mapped page. */
if (old_vcn << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT == vcn <<
+ PAGE_SHIFT == vcn <<
vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT)
+ PAGE_SHIFT)
goto fast_descend_into_child_node;
unlock_page(page);
ntfs_unmap_page(page);
@@ -1246,15 +1246,15 @@ skip_index_root:
goto iput_err_out;
}
/* Get the starting bit position in the current bitmap page. */
- cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1);
- bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1);
+ cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
+ bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
get_next_bmp_page:
ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
- (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
+ (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
(unsigned long long)bmp_pos &
- (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
+ (unsigned long long)((PAGE_SIZE * 8) - 1));
bmp_page = ntfs_map_page(bmp_mapping,
- bmp_pos >> (3 + PAGE_CACHE_SHIFT));
+ bmp_pos >> (3 + PAGE_SHIFT));
if (IS_ERR(bmp_page)) {
ntfs_error(sb, "Reading index bitmap failed.");
err = PTR_ERR(bmp_page);
@@ -1270,9 +1270,9 @@ find_next_index_buffer:
* If we have reached the end of the bitmap page, get the next
* page, and put away the old one.
*/
- if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) {
+ if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
ntfs_unmap_page(bmp_page);
- bmp_pos += PAGE_CACHE_SIZE * 8;
+ bmp_pos += PAGE_SIZE * 8;
cur_bmp_pos = 0;
goto get_next_bmp_page;
}
@@ -1285,8 +1285,8 @@ find_next_index_buffer:
ntfs_debug("Handling index buffer 0x%llx.",
(unsigned long long)bmp_pos + cur_bmp_pos);
/* If the current index buffer is in the same page we reuse the page. */
- if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) !=
- (ia_pos & (s64)PAGE_CACHE_MASK)) {
+ if ((prev_ia_pos & (s64)PAGE_MASK) !=
+ (ia_pos & (s64)PAGE_MASK)) {
prev_ia_pos = ia_pos;
if (likely(ia_page != NULL)) {
unlock_page(ia_page);
@@ -1296,7 +1296,7 @@ find_next_index_buffer:
* Map the page cache page containing the current ia_pos,
* reading it from disk if necessary.
*/
- ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT);
+ ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
if (IS_ERR(ia_page)) {
ntfs_error(sb, "Reading index allocation data failed.");
err = PTR_ERR(ia_page);
@@ -1307,10 +1307,10 @@ find_next_index_buffer:
kaddr = (u8*)page_address(ia_page);
}
/* Get the current index buffer. */
- ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK &
- ~(s64)(ndir->itype.index.block_size - 1)));
+ ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
+ ~(s64)(ndir->itype.index.block_size - 1)));
/* Bounds checks. */
- if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) {
+ if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
"inode 0x%lx or driver bug.", vdir->i_ino);
goto err_out;
@@ -1348,7 +1348,7 @@ find_next_index_buffer:
goto err_out;
}
index_end = (u8*)ia + ndir->itype.index.block_size;
- if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) {
+ if (unlikely(index_end > kaddr + PAGE_SIZE)) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
"0x%lx crosses page boundary. Impossible! "
"Cannot access! This is probably a bug in the "
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index bed4d427dfaee..f548629dfaacb 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -220,8 +220,8 @@ do_non_resident_extend:
m = NULL;
}
mapping = vi->i_mapping;
- index = old_init_size >> PAGE_CACHE_SHIFT;
- end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ index = old_init_size >> PAGE_SHIFT;
+ end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
do {
/*
* Read the page. If the page is not present, this will zero
@@ -233,7 +233,7 @@ do_non_resident_extend:
goto init_err_out;
}
if (unlikely(PageError(page))) {
- page_cache_release(page);
+ put_page(page);
err = -EIO;
goto init_err_out;
}
@@ -242,13 +242,13 @@ do_non_resident_extend:
* enough to make ntfs_writepage() work.
*/
write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
+ ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
if (ni->initialized_size > new_init_size)
ni->initialized_size = new_init_size;
write_unlock_irqrestore(&ni->size_lock, flags);
/* Set the page dirty so it gets written out. */
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
/*
* Play nice with the vm and the rest of the system. This is
* very much needed as we can potentially be modifying the
@@ -543,7 +543,7 @@ out:
err_out:
while (nr > 0) {
unlock_page(pages[--nr]);
- page_cache_release(pages[nr]);
+ put_page(pages[nr]);
}
goto out;
}
@@ -553,7 +553,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
lock_buffer(bh);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- return submit_bh(READ, bh);
+ return submit_bh(REQ_OP_READ, 0, bh);
}
/**
@@ -573,7 +573,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
* only partially being written to.
*
* If @nr_pages is greater than one, we are guaranteed that the cluster size is
- * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
+ * greater than PAGE_SIZE, that all pages in @pages are entirely inside
* the same cluster and that they are the entirety of that cluster, and that
* the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
*
@@ -653,7 +653,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
u = 0;
do_next_page:
page = pages[u];
- bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+ bh_pos = (s64)page->index << PAGE_SHIFT;
bh = head = page_buffers(page);
do {
VCN cdelta;
@@ -810,11 +810,11 @@ map_buffer_cached:
kaddr = kmap_atomic(page);
if (bh_pos < pos) {
- pofs = bh_pos & ~PAGE_CACHE_MASK;
+ pofs = bh_pos & ~PAGE_MASK;
memset(kaddr + pofs, 0, pos - bh_pos);
}
if (bh_end > end) {
- pofs = end & ~PAGE_CACHE_MASK;
+ pofs = end & ~PAGE_MASK;
memset(kaddr + pofs, 0, bh_end - end);
}
kunmap_atomic(kaddr);
@@ -942,7 +942,7 @@ rl_not_mapped_enoent:
* unmapped. This can only happen when the cluster size is
* less than the page cache size.
*/
- if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
+ if (unlikely(vol->cluster_size < PAGE_SIZE)) {
bh_cend = (bh_end + vol->cluster_size - 1) >>
vol->cluster_size_bits;
if ((bh_cend <= cpos || bh_cpos >= cend)) {
@@ -1208,7 +1208,7 @@ rl_not_mapped_enoent:
wait_on_buffer(bh);
if (likely(buffer_uptodate(bh))) {
page = bh->b_page;
- bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
+ bh_pos = ((s64)page->index << PAGE_SHIFT) +
bh_offset(bh);
/*
* If the buffer overflows the initialized size, need
@@ -1350,7 +1350,7 @@ rl_not_mapped_enoent:
bh = head = page_buffers(page);
do {
if (u == nr_pages &&
- ((s64)page->index << PAGE_CACHE_SHIFT) +
+ ((s64)page->index << PAGE_SHIFT) +
bh_offset(bh) >= end)
break;
if (!buffer_new(bh))
@@ -1422,7 +1422,7 @@ static inline int ntfs_commit_pages_after_non_resident_write(
bool partial;
page = pages[u];
- bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+ bh_pos = (s64)page->index << PAGE_SHIFT;
bh = head = page_buffers(page);
partial = false;
do {
@@ -1639,7 +1639,7 @@ static int ntfs_commit_pages_after_write(struct page **pages,
if (end < attr_len)
memcpy(kaddr + end, kattr + end, attr_len - end);
/* Zero the region outside the end of the attribute value. */
- memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+ memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
flush_dcache_page(page);
SetPageUptodate(page);
}
@@ -1706,7 +1706,7 @@ static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
unsigned len, copied;
do {
- len = PAGE_CACHE_SIZE - ofs;
+ len = PAGE_SIZE - ofs;
if (len > bytes)
len = bytes;
copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
@@ -1724,14 +1724,14 @@ out:
return total;
err:
/* Zero the rest of the target like __copy_from_user(). */
- len = PAGE_CACHE_SIZE - copied;
+ len = PAGE_SIZE - copied;
do {
if (len > bytes)
len = bytes;
zero_user(*pages, copied, len);
bytes -= len;
copied = 0;
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
} while (++pages < last_page);
goto out;
}
@@ -1787,8 +1787,8 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
* attributes.
*/
nr_pages = 1;
- if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
- nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
+ if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
+ nr_pages = vol->cluster_size >> PAGE_SHIFT;
last_vcn = -1;
do {
VCN vcn;
@@ -1796,9 +1796,9 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
unsigned ofs, do_pages, u;
size_t copied;
- start_idx = idx = pos >> PAGE_CACHE_SHIFT;
- ofs = pos & ~PAGE_CACHE_MASK;
- bytes = PAGE_CACHE_SIZE - ofs;
+ start_idx = idx = pos >> PAGE_SHIFT;
+ ofs = pos & ~PAGE_MASK;
+ bytes = PAGE_SIZE - ofs;
do_pages = 1;
if (nr_pages > 1) {
vcn = pos >> vol->cluster_size_bits;
@@ -1832,7 +1832,7 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
if (lcn == LCN_HOLE) {
start_idx = (pos & ~(s64)
vol->cluster_size_mask)
- >> PAGE_CACHE_SHIFT;
+ >> PAGE_SHIFT;
bytes = vol->cluster_size - (pos &
vol->cluster_size_mask);
do_pages = nr_pages;
@@ -1871,12 +1871,12 @@ again:
if (unlikely(status)) {
do {
unlock_page(pages[--do_pages]);
- page_cache_release(pages[do_pages]);
+ put_page(pages[do_pages]);
} while (do_pages);
break;
}
}
- u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
+ u = (pos >> PAGE_SHIFT) - pages[0]->index;
copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
i, bytes);
ntfs_flush_dcache_pages(pages + u, do_pages - u);
@@ -1889,7 +1889,7 @@ again:
}
do {
unlock_page(pages[--do_pages]);
- page_cache_release(pages[do_pages]);
+ put_page(pages[do_pages]);
} while (do_pages);
if (unlikely(status < 0))
break;
@@ -1921,7 +1921,7 @@ again:
}
} while (iov_iter_count(i));
if (cached_page)
- page_cache_release(cached_page);
+ put_page(cached_page);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
written ? "written" : "status", (unsigned long)written,
(long)status);
@@ -1952,12 +1952,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
written = ntfs_perform_write(file, from, iocb->ki_pos);
current->backing_dev_info = NULL;
inode_unlock(vi);
- if (likely(written > 0)) {
- err = generic_write_sync(file, iocb->ki_pos, written);
- if (err < 0)
- written = 0;
- }
iocb->ki_pos += written;
+ if (likely(written > 0))
+ written = generic_write_sync(iocb, written);
return written ? written : err;
}
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 096c135691aed..0d645f3579300 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -272,11 +272,11 @@ done:
descend_into_child_node:
/*
* Convert vcn to index into the index allocation attribute in units
- * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+ * of PAGE_SIZE and map the page cache page, reading it from
* disk if necessary.
*/
page = ntfs_map_page(ia_mapping, vcn <<
- idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+ idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(sb, "Failed to map index page, error %ld.",
-PTR_ERR(page));
@@ -288,9 +288,9 @@ descend_into_child_node:
fast_descend_into_child_node:
/* Get to the index allocation block. */
ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+ idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
/* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Out of bounds check failed. Corrupt inode "
"0x%lx or driver bug.", idx_ni->mft_no);
goto unm_err_out;
@@ -323,7 +323,7 @@ fast_descend_into_child_node:
goto unm_err_out;
}
index_end = (u8*)ia + idx_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_CACHE_SIZE) {
+ if (index_end > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
"crosses page boundary. Impossible! Cannot "
"access! This is probably a bug in the "
@@ -427,9 +427,9 @@ ia_done:
* the mapped page.
*/
if (old_vcn << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT == vcn <<
+ PAGE_SHIFT == vcn <<
vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT)
+ PAGE_SHIFT)
goto fast_descend_into_child_node;
unlock_page(page);
ntfs_unmap_page(page);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d284f07eda775..e01287c964a88 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -868,12 +868,12 @@ skip_attr_list_load:
ni->itype.index.block_size);
goto unm_err_out;
}
- if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+ if (ni->itype.index.block_size > PAGE_SIZE) {
ntfs_error(vi->i_sb, "Index block size (%u) > "
- "PAGE_CACHE_SIZE (%ld) is not "
+ "PAGE_SIZE (%ld) is not "
"supported. Sorry.",
ni->itype.index.block_size,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
err = -EOPNOTSUPP;
goto unm_err_out;
}
@@ -1585,10 +1585,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
"two.", ni->itype.index.block_size);
goto unm_err_out;
}
- if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE "
+ if (ni->itype.index.block_size > PAGE_SIZE) {
+ ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE "
"(%ld) is not supported. Sorry.",
- ni->itype.index.block_size, PAGE_CACHE_SIZE);
+ ni->itype.index.block_size, PAGE_SIZE);
err = -EOPNOTSUPP;
goto unm_err_out;
}
@@ -1854,7 +1854,7 @@ int ntfs_read_inode_mount(struct inode *vi)
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
- /* Provides readpage() and sync_page() for map_mft_record(). */
+ /* Provides readpage() for map_mft_record(). */
vi->i_mapping->a_ops = &ntfs_mst_aops;
ctx = ntfs_attr_get_search_ctx(ni, m);
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 1711b710b641f..27a24a42f7120 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -283,15 +283,15 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
ntfs_unmap_page(page);
}
page = ntfs_map_page(mapping, last_read_pos >>
- PAGE_CACHE_SHIFT);
+ PAGE_SHIFT);
if (IS_ERR(page)) {
err = PTR_ERR(page);
ntfs_error(vol->sb, "Failed to map page.");
goto out;
}
- buf_size = last_read_pos & ~PAGE_CACHE_MASK;
+ buf_size = last_read_pos & ~PAGE_MASK;
buf = page_address(page) + buf_size;
- buf_size = PAGE_CACHE_SIZE - buf_size;
+ buf_size = PAGE_SIZE - buf_size;
if (unlikely(last_read_pos + buf_size > i_size))
buf_size = i_size - last_read_pos;
buf_size <<= 3;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index c71de292c5ade..761f12f7f3efc 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -381,7 +381,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
* completely inside @rp, just copy it from there. Otherwise map all
* the required pages and copy the data from them.
*/
- size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK);
+ size = PAGE_SIZE - (pos & ~PAGE_MASK);
if (size >= le32_to_cpu(rp->system_page_size)) {
memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
} else {
@@ -394,8 +394,8 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
/* Copy the remaining data one page at a time. */
have_read = size;
to_read = le32_to_cpu(rp->system_page_size) - size;
- idx = (pos + size) >> PAGE_CACHE_SHIFT;
- BUG_ON((pos + size) & ~PAGE_CACHE_MASK);
+ idx = (pos + size) >> PAGE_SHIFT;
+ BUG_ON((pos + size) & ~PAGE_MASK);
do {
page = ntfs_map_page(vi->i_mapping, idx);
if (IS_ERR(page)) {
@@ -406,7 +406,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
err = -EIO;
goto err_out;
}
- size = min_t(int, to_read, PAGE_CACHE_SIZE);
+ size = min_t(int, to_read, PAGE_SIZE);
memcpy((u8*)trp + have_read, page_address(page), size);
ntfs_unmap_page(page);
have_read += size;
@@ -509,11 +509,11 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
* log page size if the page cache size is between the default log page
* size and twice that.
*/
- if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <=
+ if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
DefaultLogPageSize * 2)
log_page_size = DefaultLogPageSize;
else
- log_page_size = PAGE_CACHE_SIZE;
+ log_page_size = PAGE_SIZE;
log_page_mask = log_page_size - 1;
/*
* Use ntfs_ffs() instead of ffs() to enable the compiler to
@@ -539,7 +539,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
* to be empty.
*/
for (pos = 0; pos < size; pos <<= 1) {
- pgoff_t idx = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t idx = pos >> PAGE_SHIFT;
if (!page || page->index != idx) {
if (page)
ntfs_unmap_page(page);
@@ -550,7 +550,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
goto err_out;
}
}
- kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
+ kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
/*
* A non-empty block means the logfile is not empty while an
* empty block after a non-empty block has been encountered
@@ -821,7 +821,7 @@ map_vcn:
* completed ignore errors afterwards as we can assume
* that if one buffer worked all of them will work.
*/
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
if (should_wait) {
should_wait = false;
wait_on_buffer(bh);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3014a36a255b9..d15d492ce47b1 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -61,16 +61,16 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
* here if the volume was that big...
*/
index = (u64)ni->mft_no << vol->mft_record_size_bits >>
- PAGE_CACHE_SHIFT;
- ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ PAGE_SHIFT;
+ ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
i_size = i_size_read(mft_vi);
/* The maximum valid index into the page cache for $MFT's data. */
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
/* If the wanted index is out of bounds the mft record doesn't exist. */
if (unlikely(index >= end_index)) {
- if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
+ if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
vol->mft_record_size) {
page = ERR_PTR(-ENOENT);
ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
@@ -487,7 +487,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
}
/* Get the page containing the mirror copy of the mft record @m. */
page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
- (PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
+ (PAGE_SHIFT - vol->mft_record_size_bits));
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to map mft mirror page.");
err = PTR_ERR(page);
@@ -497,7 +497,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
BUG_ON(!PageUptodate(page));
ClearPageUptodate(page);
/* Offset of the mft mirror record inside the page. */
- page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
/* The address in the page of the mirror copy of the mft record @m. */
kmirr = page_address(page) + page_ofs;
/* Copy the mst protected mft record to the mirror. */
@@ -592,7 +592,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
clear_buffer_dirty(tbh);
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE, tbh);
+ submit_bh(REQ_OP_WRITE, 0, tbh);
}
/* Wait on i/o completion of buffers. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
@@ -785,7 +785,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
clear_buffer_dirty(tbh);
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE, tbh);
+ submit_bh(REQ_OP_WRITE, 0, tbh);
}
/* Synchronize the mft mirror now if not @sync. */
if (!sync && ni->mft_no < vol->mftmirr_size)
@@ -1178,8 +1178,8 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
for (; pass <= 2;) {
/* Cap size to pass_end. */
ofs = data_pos >> 3;
- page_ofs = ofs & ~PAGE_CACHE_MASK;
- size = PAGE_CACHE_SIZE - page_ofs;
+ page_ofs = ofs & ~PAGE_MASK;
+ size = PAGE_SIZE - page_ofs;
ll = ((pass_end + 7) >> 3) - ofs;
if (size > ll)
size = ll;
@@ -1190,7 +1190,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
*/
if (size) {
page = ntfs_map_page(mftbmp_mapping,
- ofs >> PAGE_CACHE_SHIFT);
+ ofs >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to read mft "
"bitmap, aborting.");
@@ -1328,13 +1328,13 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
*/
ll = lcn >> 3;
page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
- ll >> PAGE_CACHE_SHIFT);
+ ll >> PAGE_SHIFT);
if (IS_ERR(page)) {
up_write(&mftbmp_ni->runlist.lock);
ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
return PTR_ERR(page);
}
- b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
+ b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
tb = 1 << (lcn & 7ull);
down_write(&vol->lcnbmp_lock);
if (*b != 0xff && !(*b & tb)) {
@@ -2103,14 +2103,14 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
* The index into the page cache and the offset within the page cache
* page of the wanted mft record.
*/
- index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
- ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
+ ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
/* The maximum valid index into the page cache for $MFT's data. */
i_size = i_size_read(mft_vi);
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (unlikely(index >= end_index)) {
if (unlikely(index > end_index || ofs + vol->mft_record_size >=
- (i_size & ~PAGE_CACHE_MASK))) {
+ (i_size & ~PAGE_MASK))) {
ntfs_error(vol->sb, "Tried to format non-existing mft "
"record 0x%llx.", (long long)mft_no);
return -ENOENT;
@@ -2515,8 +2515,8 @@ mft_rec_already_initialized:
* We now have allocated and initialized the mft record. Calculate the
* index of and the offset within the page cache page the record is in.
*/
- index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
- ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
+ ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
/* Read, map, and pin the page containing the mft record. */
page = ntfs_map_page(vol->mft_ino->i_mapping, index);
if (IS_ERR(page)) {
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 443abecf01b7d..358258364616c 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -253,7 +253,7 @@ handle_name:
err = (signed)nls_name.len;
goto err_out;
}
- nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
+ nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len);
dent = d_add_ci(dent, dent_inode, &nls_name);
kfree(nls_name.name);
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index c581e26a350d5..12de47b96ca95 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -43,7 +43,7 @@ typedef enum {
NTFS_MAX_NAME_LEN = 255,
NTFS_MAX_ATTR_NAME_LEN = 255,
NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */
- NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE,
+ NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
} NTFS_CONSTANTS;
/* Global variables. */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1b38abdaa3ed3..ecb49870a680c 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -823,14 +823,14 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
vol->mft_record_size_bits, vol->mft_record_size_bits);
/*
- * We cannot support mft record sizes above the PAGE_CACHE_SIZE since
+ * We cannot support mft record sizes above the PAGE_SIZE since
* we store $MFT/$DATA, the table of mft records in the page cache.
*/
- if (vol->mft_record_size > PAGE_CACHE_SIZE) {
+ if (vol->mft_record_size > PAGE_SIZE) {
ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
- "PAGE_CACHE_SIZE on your system (%lu). "
+ "PAGE_SIZE on your system (%lu). "
"This is not supported. Sorry.",
- vol->mft_record_size, PAGE_CACHE_SIZE);
+ vol->mft_record_size, PAGE_SIZE);
return false;
}
/* We cannot support mft record sizes below the sector size. */
@@ -1096,7 +1096,7 @@ static bool check_mft_mirror(ntfs_volume *vol)
ntfs_debug("Entering.");
/* Compare contents of $MFT and $MFTMirr. */
- mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size;
+ mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
BUG_ON(!mrecs_per_page);
BUG_ON(!vol->mftmirr_size);
mft_page = mirr_page = NULL;
@@ -1615,20 +1615,20 @@ static bool load_and_init_attrdef(ntfs_volume *vol)
if (!vol->attrdef)
goto iput_failed;
index = 0;
- max_index = i_size >> PAGE_CACHE_SHIFT;
- size = PAGE_CACHE_SIZE;
+ max_index = i_size >> PAGE_SHIFT;
+ size = PAGE_SIZE;
while (index < max_index) {
/* Read the attrdef table and copy it into the linear buffer. */
read_partial_attrdef_page:
page = ntfs_map_page(ino->i_mapping, index);
if (IS_ERR(page))
goto free_iput_failed;
- memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT),
+ memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
page_address(page), size);
ntfs_unmap_page(page);
};
- if (size == PAGE_CACHE_SIZE) {
- size = i_size & ~PAGE_CACHE_MASK;
+ if (size == PAGE_SIZE) {
+ size = i_size & ~PAGE_MASK;
if (size)
goto read_partial_attrdef_page;
}
@@ -1684,20 +1684,20 @@ static bool load_and_init_upcase(ntfs_volume *vol)
if (!vol->upcase)
goto iput_upcase_failed;
index = 0;
- max_index = i_size >> PAGE_CACHE_SHIFT;
- size = PAGE_CACHE_SIZE;
+ max_index = i_size >> PAGE_SHIFT;
+ size = PAGE_SIZE;
while (index < max_index) {
/* Read the upcase table and copy it into the linear buffer. */
read_partial_upcase_page:
page = ntfs_map_page(ino->i_mapping, index);
if (IS_ERR(page))
goto iput_upcase_failed;
- memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT),
+ memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
page_address(page), size);
ntfs_unmap_page(page);
};
- if (size == PAGE_CACHE_SIZE) {
- size = i_size & ~PAGE_CACHE_MASK;
+ if (size == PAGE_SIZE) {
+ size = i_size & ~PAGE_MASK;
if (size)
goto read_partial_upcase_page;
}
@@ -2471,14 +2471,14 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
down_read(&vol->lcnbmp_lock);
/*
* Convert the number of bits into bytes rounded up, then convert into
- * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one
+ * multiples of PAGE_SIZE, rounding up so that if we have one
* full and one partial page max_index = 2.
*/
- max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
+ max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
- max_index, PAGE_CACHE_SIZE / 4);
+ max_index, PAGE_SIZE / 4);
for (index = 0; index < max_index; index++) {
unsigned long *kaddr;
@@ -2491,7 +2491,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
if (IS_ERR(page)) {
ntfs_debug("read_mapping_page() error. Skipping "
"page (index 0x%lx).", index);
- nr_free -= PAGE_CACHE_SIZE * 8;
+ nr_free -= PAGE_SIZE * 8;
continue;
}
kaddr = kmap_atomic(page);
@@ -2503,9 +2503,9 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
* ntfs_readpage().
*/
nr_free -= bitmap_weight(kaddr,
- PAGE_CACHE_SIZE * BITS_PER_BYTE);
+ PAGE_SIZE * BITS_PER_BYTE);
kunmap_atomic(kaddr);
- page_cache_release(page);
+ put_page(page);
}
ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
/*
@@ -2547,9 +2547,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
pgoff_t index;
ntfs_debug("Entering.");
- /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
+ /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
- "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
+ "0x%lx.", max_index, PAGE_SIZE / 4);
for (index = 0; index < max_index; index++) {
unsigned long *kaddr;
@@ -2562,7 +2562,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
if (IS_ERR(page)) {
ntfs_debug("read_mapping_page() error. Skipping "
"page (index 0x%lx).", index);
- nr_free -= PAGE_CACHE_SIZE * 8;
+ nr_free -= PAGE_SIZE * 8;
continue;
}
kaddr = kmap_atomic(page);
@@ -2574,9 +2574,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
* ntfs_readpage().
*/
nr_free -= bitmap_weight(kaddr,
- PAGE_CACHE_SIZE * BITS_PER_BYTE);
+ PAGE_SIZE * BITS_PER_BYTE);
kunmap_atomic(kaddr);
- page_cache_release(page);
+ put_page(page);
}
ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
index - 1);
@@ -2618,17 +2618,17 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
/* Type of filesystem. */
sfs->f_type = NTFS_SB_MAGIC;
/* Optimal transfer block size. */
- sfs->f_bsize = PAGE_CACHE_SIZE;
+ sfs->f_bsize = PAGE_SIZE;
/*
* Total data blocks in filesystem in units of f_bsize and since
* inodes are also stored in data blocs ($MFT is a file) this is just
* the total clusters.
*/
sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
/* Free data blocks in filesystem in units of f_bsize. */
size = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
if (size < 0LL)
size = 0LL;
/* Free blocks avail to non-superuser, same as above on NTFS. */
@@ -2639,11 +2639,11 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
/*
* Convert the maximum number of set bits into bytes rounded up, then
- * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we
+ * convert into multiples of PAGE_SIZE, rounding up so that if we
* have one full and one partial page max_index = 2.
*/
max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
- + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
read_unlock_irqrestore(&mft_ni->size_lock, flags);
/* Number of inodes in filesystem (at this point in time). */
sfs->f_files = size;
@@ -2765,15 +2765,15 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
if (!parse_options(vol, (char*)opt))
goto err_out_now;
- /* We support sector sizes up to the PAGE_CACHE_SIZE. */
- if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+ /* We support sector sizes up to the PAGE_SIZE. */
+ if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
if (!silent)
ntfs_error(sb, "Device has unsupported sector size "
"(%i). The maximum supported sector "
"size on this architecture is %lu "
"bytes.",
bdev_logical_block_size(sb->s_bdev),
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
goto err_out_now;
}
/*
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index e27e6527912bb..4342c7ee7d202 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,7 +1,5 @@
ccflags-y := -Ifs/ocfs2
-ccflags-y += -DCATCH_BH_JBD_RACES
-
obj-$(CONFIG_OCFS2_FS) += \
ocfs2.o \
ocfs2_stackglue.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0cdf497c91efb..2162434728c02 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
brelse(di_bh);
return acl;
}
+
+int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return 0;
+
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (ret)
+ return ret;
+ ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+ acl, NULL, NULL);
+ posix_acl_release(acl);
+ return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl = NULL;
+ int ret = 0, ret2;
+ umode_t mode;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+ dir_bh);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl) {
+ mode = inode->i_mode & ~current_umask();
+ ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ }
+ if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = ocfs2_set_acl(handle, inode, di_bh,
+ ACL_TYPE_DEFAULT, acl,
+ meta_ac, data_ac);
+ if (ret)
+ goto cleanup;
+ }
+ mode = inode->i_mode;
+ ret = __posix_acl_create(&acl, GFP_NOFS, &mode);
+ if (ret < 0)
+ return ret;
+
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
+ if (ret > 0) {
+ ret = ocfs2_set_acl(handle, inode,
+ di_bh, ACL_TYPE_ACCESS,
+ acl, meta_ac, data_ac);
+ }
+ }
+cleanup:
+ posix_acl_release(acl);
+ return ret;
+}
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 3fce68d086251..2783a75b3999e 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle,
struct posix_acl *acl,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac);
+extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+ struct buffer_head *, struct buffer_head *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 70907d638b607..7dabbc31060e4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
{
int ret;
u32 left_cpos, rec_range, trunc_range;
- int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ int is_rightmost_tree_rec = 0;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle,
memset(rec, 0, sizeof(*rec));
ocfs2_cleanup_merge(el, index);
- wants_rotate = 1;
next_free = le16_to_cpu(el->l_next_free_rec);
if (is_rightmost_tree_rec && next_free > 1) {
@@ -6107,6 +6106,43 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
}
}
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ inode_lock(osb->osb_tl_inode);
+ truncated_clusters = osb->truncated_clusters;
+ inode_unlock(osb->osb_tl_inode);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
int slot_num,
struct inode **tl_inode,
@@ -6671,7 +6707,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
{
int i;
struct page *page;
- unsigned int from, to = PAGE_CACHE_SIZE;
+ unsigned int from, to = PAGE_SIZE;
struct super_block *sb = inode->i_sb;
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
@@ -6679,21 +6715,21 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
if (numpages == 0)
goto out;
- to = PAGE_CACHE_SIZE;
+ to = PAGE_SIZE;
for(i = 0; i < numpages; i++) {
page = pages[i];
- from = start & (PAGE_CACHE_SIZE - 1);
- if ((end >> PAGE_CACHE_SHIFT) == page->index)
- to = end & (PAGE_CACHE_SIZE - 1);
+ from = start & (PAGE_SIZE - 1);
+ if ((end >> PAGE_SHIFT) == page->index)
+ to = end & (PAGE_SIZE - 1);
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > PAGE_SIZE);
+ BUG_ON(to > PAGE_SIZE);
ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
&phys);
- start = (page->index + 1) << PAGE_CACHE_SHIFT;
+ start = (page->index + 1) << PAGE_SHIFT;
}
out:
if (pages)
@@ -6712,7 +6748,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
numpages = 0;
last_page_bytes = PAGE_ALIGN(end);
- index = start >> PAGE_CACHE_SHIFT;
+ index = start >> PAGE_SHIFT;
do {
pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
if (!pages[numpages]) {
@@ -6723,7 +6759,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
numpages++;
index++;
- } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
+ } while (index < (last_page_bytes >> PAGE_SHIFT));
out:
if (ret != 0) {
@@ -6950,8 +6986,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* to do that now.
*/
if (!ocfs2_sparse_alloc(osb) &&
- PAGE_CACHE_SIZE < osb->s_clustersize)
- end = PAGE_CACHE_SIZE;
+ PAGE_SIZE < osb->s_clustersize)
+ end = PAGE_SIZE;
ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
if (ret) {
@@ -6971,8 +7007,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_unlock;
}
- page_end = PAGE_CACHE_SIZE;
- if (PAGE_CACHE_SIZE > osb->s_clustersize)
+ page_end = PAGE_SIZE;
+ if (PAGE_SIZE > osb->s_clustersize)
page_end = osb->s_clustersize;
for (i = 0; i < num_pages; i++)
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index f3dc1b0dfffc8..4a5152ec88a33 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -188,6 +188,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
u64 start_blk,
unsigned int num_clusters);
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed);
/*
* Process local structure which describes the block unlinks done
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1581240a7ca04..98d36548153dc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -234,7 +234,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
size = i_size_read(inode);
- if (size > PAGE_CACHE_SIZE ||
+ if (size > PAGE_SIZE ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
"Inode %llu has with inline data has bad size: %Lu\n",
@@ -247,7 +247,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size)
memcpy(kaddr, di->id2.i_data.id_data, size);
/* Clear the remaining part of the page */
- memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
+ memset(kaddr + size, 0, PAGE_SIZE - size);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -282,7 +282,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ loff_t start = (loff_t)page->index << PAGE_SHIFT;
int ret, unlock = 1;
trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
@@ -385,7 +385,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
* drop out in that case as it's not worth handling here.
*/
last = list_entry(pages->prev, struct page, lru);
- start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+ start = (loff_t)last->index << PAGE_SHIFT;
if (start >= i_size_read(inode))
goto out_unlock;
@@ -511,12 +511,12 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
unsigned int *start,
unsigned int *end)
{
- unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+ unsigned int cluster_start = 0, cluster_end = PAGE_SIZE;
- if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+ if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) {
unsigned int cpp;
- cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+ cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits);
cluster_start = cpos % cpp;
cluster_start = cluster_start << osb->s_clustersize_bits;
@@ -640,7 +640,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
!buffer_new(bh) &&
ocfs2_should_read_blk(inode, page, block_start) &&
(block_start < from || block_end > to)) {
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++=bh;
}
@@ -684,13 +684,13 @@ next_bh:
return ret;
}
-#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES 1
#else
-#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE)
#endif
-#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE)
struct ocfs2_unwritten_extent {
struct list_head ue_node;
@@ -785,7 +785,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
if (pages[i]) {
unlock_page(pages[i]);
mark_page_accessed(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
}
}
@@ -808,7 +808,7 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
}
}
mark_page_accessed(wc->w_target_page);
- page_cache_release(wc->w_target_page);
+ put_page(wc->w_target_page);
}
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
}
@@ -857,7 +857,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_di_bh = di_bh;
wc->w_type = type;
- if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+ if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
else
wc->w_large_pages = 0;
@@ -920,7 +920,7 @@ static void ocfs2_write_failure(struct inode *inode,
loff_t user_pos, unsigned user_len)
{
int i;
- unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+ unsigned from = user_pos & (PAGE_SIZE - 1),
to = user_pos + user_len;
struct page *tmppage;
@@ -960,7 +960,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
(page_offset(page) <= user_pos));
if (page == wc->w_target_page) {
- map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+ map_from = user_pos & (PAGE_SIZE - 1);
map_to = map_from + user_len;
if (new)
@@ -1034,7 +1034,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t last_byte;
- target_index = user_pos >> PAGE_CACHE_SHIFT;
+ target_index = user_pos >> PAGE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
@@ -1053,14 +1053,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
last_byte = max(user_pos + user_len, i_size_read(inode));
BUG_ON(last_byte < 1);
- end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+ end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1;
if ((start + wc->w_num_pages) > end_index)
wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
}
- end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (user_pos + user_len - 1) >> PAGE_SHIFT;
for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
@@ -1082,7 +1082,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
goto out;
}
- page_cache_get(mmap_page);
+ get_page(mmap_page);
wc->w_pages[i] = mmap_page;
wc->w_target_locked = true;
} else if (index >= target_index && index <= end_index &&
@@ -1272,7 +1272,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
{
struct ocfs2_write_cluster_desc *desc;
- wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+ wc->w_target_from = pos & (PAGE_SIZE - 1);
wc->w_target_to = wc->w_target_from + len;
if (alloc == 0)
@@ -1309,7 +1309,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
&wc->w_target_to);
} else {
wc->w_target_from = 0;
- wc->w_target_to = PAGE_CACHE_SIZE;
+ wc->w_target_to = PAGE_SIZE;
}
}
@@ -1645,43 +1645,6 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
-/*
- * Try to flush truncate logs if we can free enough clusters from it.
- * As for return value, "< 0" means error, "0" no space and "1" means
- * we have freed enough spaces and let the caller try to allocate again.
- */
-static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
- unsigned int needed)
-{
- tid_t target;
- int ret = 0;
- unsigned int truncated_clusters;
-
- inode_lock(osb->osb_tl_inode);
- truncated_clusters = osb->truncated_clusters;
- inode_unlock(osb->osb_tl_inode);
-
- /*
- * Check whether we can succeed in allocating if we free
- * the truncate log.
- */
- if (truncated_clusters < needed)
- goto out;
-
- ret = ocfs2_flush_truncate_log(osb);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
- jbd2_log_wait_commit(osb->journal->j_journal, target);
- ret = 1;
- }
-out:
- return ret;
-}
-
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
@@ -1981,7 +1944,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
struct page *page, void *fsdata)
{
int i, ret;
- unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from, to, start = pos & (PAGE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_write_ctxt *wc = fsdata;
@@ -2027,8 +1990,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
from = wc->w_target_from;
to = wc->w_target_to;
- BUG_ON(from > PAGE_CACHE_SIZE ||
- to > PAGE_CACHE_SIZE ||
+ BUG_ON(from > PAGE_SIZE ||
+ to > PAGE_SIZE ||
to < from);
} else {
/*
@@ -2037,7 +2000,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
* to flush their entire range.
*/
from = 0;
- to = PAGE_CACHE_SIZE;
+ to = PAGE_SIZE;
}
if (page_has_buffers(tmppage)) {
@@ -2311,7 +2274,7 @@ static void ocfs2_dio_end_io_write(struct inode *inode,
/* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
* are in that context. */
if (dwc->dw_writer_pid != task_pid_nr(current)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
locked = 1;
}
@@ -2390,7 +2353,7 @@ out:
ocfs2_free_alloc_context(meta_ac);
ocfs2_run_deallocs(osb, &dealloc);
if (locked)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ocfs2_dio_free_write_ctx(inode, dwc);
}
@@ -2423,13 +2386,11 @@ static int ocfs2_dio_end_io(struct kiocb *iocb,
return 0;
}
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
+ struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- loff_t end = offset + iter->count;
get_block_t *get_block;
/*
@@ -2440,7 +2401,8 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return 0;
/* Fallback to buffered I/O if we do not support append dio. */
- if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+ if (iocb->ki_pos + iter->count > i_size_read(inode) &&
+ !ocfs2_supports_append_dio(osb))
return 0;
if (iov_iter_rw(iter) == READ)
@@ -2449,7 +2411,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
get_block = ocfs2_dio_get_block;
return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, offset, get_block,
+ iter, get_block,
ocfs2_dio_end_io, NULL, 0);
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index fe50ded1b4ce7..8f040f88ade44 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
wait_on_buffer(bh);
@@ -139,17 +139,22 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
lock_buffer(bh);
if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
mlog(ML_ERROR,
"block %llu had the JBD bit set "
"while I was in lock_buffer!",
(unsigned long long)bh->b_blocknr);
BUG();
+#else
+ unlock_buffer(bh);
+ continue;
+#endif
}
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, 0, bh);
}
for (i = nr; i > 0; i--) {
@@ -305,7 +310,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
if (validate)
set_buffer_needs_validate(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, 0, bh);
continue;
}
}
@@ -419,7 +424,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index bd15929b5f925..636abcbd46501 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -272,10 +272,21 @@ struct o2hb_region {
struct delayed_work hr_write_timeout_work;
unsigned long hr_last_timeout_start;
+ /* negotiate timer, used to negotiate extending hb timeout. */
+ struct delayed_work hr_nego_timeout_work;
+ unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
/* Used during o2hb_check_slot to hold a copy of the block
* being checked because we temporarily have to zero out the
* crc field. */
struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+ /* Message key for negotiate timeout message. */
+ unsigned int hr_key;
+ struct list_head hr_handler_list;
+
+ /* last hb status, 0 for success, other value for error. */
+ int hr_last_hb_status;
};
struct o2hb_bio_wait_ctxt {
@@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
+
+enum {
+ O2HB_NEGO_TIMEOUT_MSG = 1,
+ O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+ u8 node_num;
+};
+
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
@@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work)
o2quo_disk_timeout();
}
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
{
/* Arm writeout only after thread reaches steady state */
if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
spin_unlock(&o2hb_live_lock);
}
cancel_delayed_work(&reg->hr_write_timeout_work);
- reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+ cancel_delayed_work(&reg->hr_nego_timeout_work);
+ /* negotiate timeout must be less than write timeout. */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+ memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
}
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
{
cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+ cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+ struct o2hb_nego_msg msg;
+ int status, ret;
+
+ msg.node_num = o2nm_this_node();
+again:
+ ret = o2net_send_message(type, key, &msg, sizeof(msg),
+ target, &status);
+
+ if (ret == -EAGAIN || ret == -ENOMEM) {
+ msleep(100);
+ goto again;
+ }
+
+ return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int master_node, i, ret;
+ struct o2hb_region *reg;
+
+ reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+ /* don't negotiate timeout if last hb failed since it is very
+ * possible io failed. Should let write timeout fence self.
+ */
+ if (reg->hr_last_hb_status)
+ return;
+
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ /* lowest node as master node to make negotiate decision. */
+ master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+ if (master_node == o2nm_this_node()) {
+ if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ set_bit(master_node, reg->hr_nego_node_bitmap);
+ }
+ if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+ sizeof(reg->hr_nego_node_bitmap))) {
+ /* check negotiate bitmap every second to do timeout
+ * approve decision.
+ */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(1000));
+
+ return;
+ }
+
+ printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ /* approve negotiate timeout request. */
+ o2hb_arm_timeout(reg);
+
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ if (i == master_node)
+ continue;
+
+ mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+ ret = o2hb_send_nego_msg(reg->hr_key,
+ O2HB_NEGO_APPROVE_MSG, i);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+ i, ret);
+ }
+ } else {
+ /* negotiate timeout with master node. */
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+ reg->hr_dev_name, master_node);
+ ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+ master_node);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+ master_node, ret);
+ }
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+ struct o2hb_nego_msg *nego_msg;
+
+ nego_msg = (struct o2hb_nego_msg *)msg->buf;
+ printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+ nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+ if (nego_msg->node_num < O2NM_MAX_NODES)
+ set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+ else
+ mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+ return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+
+ printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ o2hb_arm_timeout(reg);
+ return 0;
}
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -390,7 +530,8 @@ static void o2hb_bio_end_io(struct bio *bio)
static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc,
unsigned int *current_slot,
- unsigned int max_slots)
+ unsigned int max_slots, int op,
+ int op_flags)
{
int len, current_page;
unsigned int vec_len, vec_start;
@@ -416,14 +557,15 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
bio->bi_bdev = reg->hr_bdev;
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
+ bio_set_op_attrs(bio, op, op_flags);
- vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+ vec_start = (cs << bits) % PAGE_SIZE;
while(cs < max_slots) {
current_page = cs / spp;
page = reg->hr_slot_data[current_page];
- vec_len = min(PAGE_CACHE_SIZE - vec_start,
- (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
+ vec_len = min(PAGE_SIZE - vec_start,
+ (max_slots-cs) * (PAGE_SIZE/spp) );
mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
current_page, vec_len, vec_start);
@@ -431,7 +573,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
len = bio_add_page(bio, page, vec_len, vec_start);
if (len != vec_len) break;
- cs += vec_len / (PAGE_CACHE_SIZE/spp);
+ cs += vec_len / (PAGE_SIZE/spp);
vec_start = 0;
}
@@ -451,7 +593,8 @@ static int o2hb_read_slots(struct o2hb_region *reg,
o2hb_bio_wait_init(&wc);
while(current_slot < max_slots) {
- bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
+ bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots,
+ REQ_OP_READ, 0);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -459,7 +602,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
}
atomic_inc(&wc.wc_num_reqs);
- submit_bio(READ, bio);
+ submit_bio(bio);
}
status = 0;
@@ -483,7 +626,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
slot = o2nm_this_node();
- bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
+ bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE,
+ WRITE_SYNC);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -491,7 +635,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
}
atomic_inc(&write_wc->wc_num_reqs);
- submit_bio(WRITE_SYNC, bio);
+ submit_bio(bio);
status = 0;
bail:
@@ -1032,7 +1176,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
/* Skip disarming the timeout if own slot has stale/bad data */
if (own_slot_ok) {
o2hb_set_quorum_device(reg);
- o2hb_arm_write_timeout(reg);
+ o2hb_arm_timeout(reg);
+ reg->hr_last_timeout_start = jiffies;
}
bail:
@@ -1096,6 +1241,7 @@ static int o2hb_thread(void *data)
before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
+ reg->hr_last_hb_status = ret;
after_hb = ktime_get_real();
@@ -1114,7 +1260,7 @@ static int o2hb_thread(void *data)
}
}
- o2hb_disarm_write_timeout(reg);
+ o2hb_disarm_timeout(reg);
/* unclean stop is only used in very bad situation */
for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1451,12 +1597,12 @@ static void o2hb_region_release(struct config_item *item)
list_del(&reg->hr_all_item);
spin_unlock(&o2hb_live_lock);
+ o2net_unregister_handler_list(&reg->hr_handler_list);
kfree(reg);
}
static int o2hb_read_block_input(struct o2hb_region *reg,
const char *page,
- size_t count,
unsigned long *ret_bytes,
unsigned int *ret_bits)
{
@@ -1499,8 +1645,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
if (reg->hr_bdev)
return -EINVAL;
- status = o2hb_read_block_input(reg, page, count,
- &block_bytes, &block_bits);
+ status = o2hb_read_block_input(reg, page, &block_bytes,
+ &block_bits);
if (status)
return status;
@@ -1576,7 +1722,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
static void o2hb_init_region_params(struct o2hb_region *reg)
{
- reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+ reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits;
reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
@@ -1763,6 +1909,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
}
INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+ INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
/*
* A node is considered live after it has beat LIVE_THRESHOLD
@@ -1996,13 +2143,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ /* this is the same way to generate msg key as dlm, for local heartbeat,
+ * name is also the same, so make initial crc value different to avoid
+ * message key conflict.
+ */
+ reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+ name, strlen(name));
+ INIT_LIST_HEAD(&reg->hr_handler_list);
+ ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_timeout_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto free;
+
+ ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_approve_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto unregister_handler;
+
ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
if (ret) {
config_item_put(&reg->hr_item);
- goto free;
+ goto unregister_handler;
}
return &reg->hr_item;
+
+unregister_handler:
+ o2net_unregister_handler_list(&reg->hr_handler_list);
free:
kfree(reg);
return ERR_PTR(ret);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2d0acd6678fe4..1d67fcbf71606 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -600,10 +600,11 @@ static void o2net_set_nn_state(struct o2net_node *nn,
static void o2net_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
+ struct o2net_sock_container *sc;
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_user_data) {
- struct o2net_sock_container *sc = sk->sk_user_data;
+ read_lock_bh(&sk->sk_callback_lock);
+ sc = sk->sk_user_data;
+ if (sc) {
sclog(sc, "data_ready hit\n");
o2net_set_data_ready_time(sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
@@ -611,7 +612,7 @@ static void o2net_data_ready(struct sock *sk)
} else {
ready = sk->sk_data_ready;
}
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
}
@@ -622,7 +623,7 @@ static void o2net_state_change(struct sock *sk)
void (*state_change)(struct sock *sk);
struct o2net_sock_container *sc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
sc = sk->sk_user_data;
if (sc == NULL) {
state_change = sk->sk_state_change;
@@ -649,7 +650,7 @@ static void o2net_state_change(struct sock *sk)
break;
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
state_change(sk);
}
@@ -1617,16 +1618,12 @@ static void o2net_start_connect(struct work_struct *work)
/* watch for racing with tearing a node down */
node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
- if (node == NULL) {
- ret = 0;
+ if (node == NULL)
goto out;
- }
mynode = o2nm_get_node_by_num(o2nm_this_node());
- if (mynode == NULL) {
- ret = 0;
+ if (mynode == NULL)
goto out;
- }
spin_lock(&nn->nn_lock);
/*
@@ -2012,7 +2009,7 @@ static void o2net_listen_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ready = sk->sk_user_data;
if (ready == NULL) { /* check for teardown race */
ready = sk->sk_data_ready;
@@ -2039,7 +2036,7 @@ static void o2net_listen_data_ready(struct sock *sk)
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
if (ready != NULL)
ready(sk);
}
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b95e7df5b76ac..94b18369b1cc5 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,6 +44,9 @@
* version here in tcp_internal.h should not need to be bumped for
* filesystem locking changes.
*
+ * New in version 12
+ * - Negotiate hb timeout when storage is down.
+ *
* New in version 11
* - Negotiation of filesystem locking in the dlm join.
*
@@ -75,7 +78,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 11ULL
+#define O2NET_PROTOCOL_VERSION 12ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 004f2cbe8f71e..e9f3705c4c9fe 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -47,7 +47,7 @@
#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
/* Intended to make it easier for us to switch out hash functions */
-#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+#define dlm_lockid_hash(_n, _l) full_name_hash(NULL, _n, _l)
enum dlm_mle_type {
DLM_MLE_BLOCK = 0,
@@ -1004,6 +1004,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 nodenum, u8 *real_master);
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 825136070d2c1..e7b760deefaee 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -347,26 +347,6 @@ static struct dentry *dlm_debugfs_root;
#define DLM_DEBUGFS_PURGE_LIST "purge_list"
/* begin - utils funcs */
-static void dlm_debug_free(struct kref *kref)
-{
- struct dlm_debug_ctxt *dc;
-
- dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
-
- kfree(dc);
-}
-
-static void dlm_debug_put(struct dlm_debug_ctxt *dc)
-{
- if (dc)
- kref_put(&dc->debug_refcnt, dlm_debug_free);
-}
-
-static void dlm_debug_get(struct dlm_debug_ctxt *dc)
-{
- kref_get(&dc->debug_refcnt);
-}
-
static int debug_release(struct inode *inode, struct file *file)
{
free_page((unsigned long)file->private_data);
@@ -932,11 +912,9 @@ int dlm_debug_init(struct dlm_ctxt *dlm)
goto bail;
}
- dlm_debug_get(dc);
return 0;
bail:
- dlm_debug_shutdown(dlm);
return -ENOMEM;
}
@@ -949,7 +927,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
debugfs_remove(dc->debug_mle_dentry);
debugfs_remove(dc->debug_lockres_dentry);
debugfs_remove(dc->debug_state_dentry);
- dlm_debug_put(dc);
+ kfree(dc);
+ dc = NULL;
}
}
@@ -969,7 +948,6 @@ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
mlog_errno(-ENOMEM);
goto bail;
}
- kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
return 0;
bail:
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 1f27c4812d1ac..5ced5482e7d35 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -30,7 +30,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle);
#ifdef CONFIG_DEBUG_FS
struct dlm_debug_ctxt {
- struct kref debug_refcnt;
struct dentry *debug_state_dentry;
struct dentry *debug_lockres_dentry;
struct dentry *debug_mle_dentry;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 12e064b8be9af..533bd524e41eb 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -172,12 +172,10 @@ void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
- struct qstr *q;
assert_spin_locked(&dlm->spinlock);
- q = &res->lockname;
- bucket = dlm_lockres_hash(dlm, q->hash);
+ bucket = dlm_lockres_hash(dlm, res->lockname.hash);
/* get a reference for our hashtable */
dlm_lockres_get(res);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9aed6e2022014..6ea06f8a7d295 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2276,9 +2276,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
- BUG();
- }
- return ret ? ret : r;
+ if (r == -ENOMEM)
+ BUG();
+ } else
+ ret = r;
+
+ return ret;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2416,46 +2419,26 @@ int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
}
spin_lock(&res->spinlock);
- BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
- if (!list_empty(&res->purge)) {
- mlog(0, "%s: Removing res %.*s from purgelist\n",
- dlm->name, res->lockname.len, res->lockname.name);
- list_del_init(&res->purge);
- dlm_lockres_put(res);
- dlm->purge_count--;
- }
-
- if (!__dlm_lockres_unused(res)) {
- mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- BUG();
- }
-
- __dlm_unhash_lockres(dlm, res);
-
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
+ if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
+ "but it is already derefed!\n", dlm->name,
+ res->lockname.len, res->lockname.name, node);
+ ret = 0;
+ goto done;
}
- spin_unlock(&dlm->track_lock);
- /* lockres is not in the hash now. drop the flag and wake up
- * any processes waiting in dlm_get_lock_resource.
- */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ __dlm_do_purge_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- dlm_lockres_put(res);
-
spin_unlock(&dlm->spinlock);
+ ret = 0;
done:
+ if (res)
+ dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f6b313898763a..dd5cb8bcefd13 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2343,6 +2343,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
+ struct hlist_node *tmp;
struct dlm_lock *lock;
@@ -2365,7 +2366,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, bucket, hash_node) {
+ hlist_for_each_entry_safe(res, tmp, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2386,8 +2387,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
- dlm_lockres_clear_refmap_bit(dlm, res,
- dead_node);
+
+ if ((res->owner == dead_node) &&
+ (res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
+ } else if (res->owner == dlm->node_num)
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
spin_unlock(&res->spinlock);
continue;
}
@@ -2398,14 +2408,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
mlog(0, "%s:%.*s: owned by "
"dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
+ "dropping its ref when master died. "
+ "continue, purging the lockres.\n",
dlm->name, res->lockname.len,
res->lockname.name, dead_node);
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
}
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
- dlm_move_lockres_to_recovery_list(dlm,
- res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 68d239ba0c63c..838a06d4066a6 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -160,6 +160,52 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
+/*
+ * Do the real purge work:
+ * unhash the lockres, and
+ * clear flag DLM_LOCK_RES_DROPPING_REF.
+ * It requires dlm and lockres spinlock to be taken.
+ */
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /*
+ * lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+}
+
static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -175,6 +221,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, master);
if (!master) {
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "%s: res %.*s already in DLM_LOCK_RES_DROPPING_REF state\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
spin_unlock(&res->spinlock);
@@ -203,8 +256,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
- if (!master && ret != 0) {
- mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ if (!master && ret == DLM_DEREF_RESPONSE_INPROG) {
+ mlog(0, "%s: deref %.*s in progress\n",
dlm->name, res->lockname.len, res->lockname.name);
spin_unlock(&res->spinlock);
return;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 03768bb3aab15..ef474cdd64047 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -469,7 +469,7 @@ static int dlmfs_mkdir(struct inode * dir,
{
int status;
struct inode *inode = NULL;
- struct qstr *domain = &dentry->d_name;
+ const struct qstr *domain = &dentry->d_name;
struct dlmfs_inode_private *ip;
struct ocfs2_cluster_connection *conn;
@@ -518,7 +518,7 @@ static int dlmfs_create(struct inode *dir,
{
int status = 0;
struct inode *inode;
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
mlog(0, "create %.*s\n", name->len, name->name);
@@ -571,8 +571,8 @@ static int dlmfs_fill_super(struct super_block * sb,
int silent)
{
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 0499e3fb7bdbd..f70cda2f090d5 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -667,7 +667,7 @@ void user_dlm_set_locking_protocol(void)
ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
}
-struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
+struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name)
{
int rc;
struct ocfs2_cluster_connection *conn;
diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 3b42d79531d72..ede94a6e7fd33 100644
--- a/fs/ocfs2/dlmfs/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -83,7 +83,7 @@ void user_dlm_write_lvb(struct inode *inode,
ssize_t user_dlm_read_lvb(struct inode *inode,
char *val,
unsigned int len);
-struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
+struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name);
void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
void user_dlm_set_locking_protocol(void);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 474e57f834e6c..83d576f6a287b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -54,6 +54,7 @@
#include "uptodate.h"
#include "quota.h"
#include "refcounttree.h"
+#include "acl.h"
#include "buffer_head_io.h"
@@ -1634,7 +1635,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
BUG_ON(!ocfs2_inode_is_new(inode));
mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1664,10 +1664,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
}
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto bail;
- }
bail:
return ret;
@@ -1679,8 +1677,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu take %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
@@ -1723,8 +1719,6 @@ int ocfs2_open_lock(struct inode *inode)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1748,8 +1742,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu try to take %s open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
@@ -2327,8 +2319,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *local_bh = NULL;
- BUG_ON(!inode);
-
mlog(0, "inode %llu, take %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -3623,6 +3613,8 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
+ forget_all_cached_acls(inode);
+
out:
return UNBLOCK_CONTINUE;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c18ab45f8d210..4e7b0dc224505 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -770,14 +770,14 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
+ unsigned long index = abs_from >> PAGE_SHIFT;
handle_t *handle;
int ret = 0;
unsigned zero_from, zero_to, block_start, block_end;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
BUG_ON(abs_from >= abs_to);
- BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
@@ -794,10 +794,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
}
/* Get the offsets within the page that we want to zero */
- zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
- zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ zero_from = abs_from & (PAGE_SIZE - 1);
+ zero_to = abs_to & (PAGE_SIZE - 1);
if (!zero_to)
- zero_to = PAGE_CACHE_SIZE;
+ zero_to = PAGE_SIZE;
trace_ocfs2_write_zero_page(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -851,7 +851,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
out_unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out_commit_trans:
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -959,7 +959,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
BUG_ON(range_start >= range_end);
while (zero_pos < range_end) {
- next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
if (next_pos > range_end)
next_pos = range_end;
rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
@@ -1268,20 +1268,20 @@ bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- brelse(bh);
/* Release quota pointers in case we acquired them */
for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
dqput(transfer_to[qtype]);
if (!status && attr->ia_valid & ATTR_MODE) {
- status = posix_acl_chmod(inode, inode->i_mode);
+ status = ocfs2_acl_chmod(inode, bh);
if (status < 0)
mlog_errno(status);
}
if (inode_locked)
ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
return status;
}
@@ -1290,7 +1290,7 @@ int ocfs2_getattr(struct vfsmount *mnt,
struct kstat *stat)
{
struct inode *inode = d_inode(dentry);
- struct super_block *sb = d_inode(dentry)->i_sb;
+ struct super_block *sb = dentry->d_sb;
struct ocfs2_super *osb = sb->s_fs_info;
int err;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 12f4a9e9800f9..c56a7679df93a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
}
if (is_bad_inode(inode)) {
iput(inode);
- if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
- (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
- /* Return OCFS2_FILECHECK_ERR_XXX related errno */
- inode = ERR_PTR(rc);
- else
- inode = ERR_PTR(-ESTALE);
+ inode = ERR_PTR(rc);
goto bail;
}
@@ -262,7 +257,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
if (args->fi_sysfile_type != 0)
- lockdep_set_class(&inode->i_mutex,
+ lockdep_set_class(&inode->i_rwsem,
&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index d8f3fc8d25515..50cc550474433 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -145,22 +145,15 @@ int ocfs2_drop_inode(struct inode *inode);
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
-int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
int create_ino);
-void ocfs2_read_inode(struct inode *inode);
-void ocfs2_read_inode2(struct inode *inode, void *opaque);
-ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
- size_t size, loff_t *offp);
void ocfs2_sync_blockdev(struct super_block *sb);
void ocfs2_refresh_inode(struct inode *inode,
struct ocfs2_dinode *fe);
int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
-struct buffer_head *ocfs2_bread(struct inode *inode,
- int block, int *err, int reada);
void ocfs2_set_inode_flags(struct inode *inode);
void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index e607419cdfa46..a244f14c6b877 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
int status = 0;
int i;
u64 v_blkno, p_blkno, p_blocks, num_blocks;
-#define CONCURRENT_JOURNAL_FILL 32ULL
- struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
-
- memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+ struct buffer_head *bh = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
v_blkno = 0;
@@ -1174,29 +1172,32 @@ static int ocfs2_force_read_journal(struct inode *inode)
goto bail;
}
- if (p_blocks > CONCURRENT_JOURNAL_FILL)
- p_blocks = CONCURRENT_JOURNAL_FILL;
-
- /* We are reading journal data which should not
- * be put in the uptodate cache */
- status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
- p_blkno, p_blocks, bhs);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ for (i = 0; i < p_blocks; i++, p_blkno++) {
+ bh = __find_get_block(osb->sb->s_bdev, p_blkno,
+ osb->sb->s_blocksize);
+ /* block not cached. */
+ if (!bh)
+ continue;
+
+ brelse(bh);
+ bh = NULL;
+ /* We are reading journal data which should not
+ * be put in the uptodate cache.
+ */
+ status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
- for(i = 0; i < p_blocks; i++) {
- brelse(bhs[i]);
- bhs[i] = NULL;
+ brelse(bh);
+ bh = NULL;
}
v_blkno += p_blocks;
}
bail:
- for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
- brelse(bhs[i]);
return status;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index f4cd3c3e9fb70..497a4171ef61f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
- return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+ return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
}
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9ea081f4e6e46..71545ad4628ce 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -65,13 +65,13 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
- unsigned int len = PAGE_CACHE_SIZE;
+ unsigned int len = PAGE_SIZE;
pgoff_t last_index;
struct page *locked_page = NULL;
void *fsdata;
loff_t size = i_size_read(inode);
- last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (size - 1) >> PAGE_SHIFT;
/*
* There are cases that lead to the page no longer bebongs to the
@@ -102,7 +102,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
* because the "write" would invalidate their data.
*/
if (page->index == last_index)
- len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
+ len = ((size - 1) & ~PAGE_MASK) + 1;
ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
&locked_page, &fsdata, di_bh, page);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6b3e87189a646..a8f1225e6d9b7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
- struct posix_acl *default_acl = NULL, *acl = NULL;
struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
@@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (status) {
- mlog_errno(status);
- goto leave;
- }
-
handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
S_ISDIR(mode),
xattr_credits));
@@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir,
inc_nlink(dir);
}
- if (default_acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_DEFAULT, default_acl,
- meta_ac, data_ac);
- }
- if (!status && acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_ACCESS, acl,
- meta_ac, data_ac);
- }
+ status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+ meta_ac, data_ac);
if (status < 0) {
mlog_errno(status);
@@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir,
d_instantiate(dentry, inode);
status = 0;
leave:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6cf6538a06516..e63af7ddfe688 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -822,10 +822,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
u32 clusters = pg_index;
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
- if (unlikely(PAGE_CACHE_SHIFT > cbits))
- clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
- else if (PAGE_CACHE_SHIFT < cbits)
- clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+ if (unlikely(PAGE_SHIFT > cbits))
+ clusters = pg_index << (PAGE_SHIFT - cbits);
+ else if (PAGE_SHIFT < cbits)
+ clusters = pg_index >> (cbits - PAGE_SHIFT);
return clusters;
}
@@ -839,10 +839,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
pgoff_t index = clusters;
- if (PAGE_CACHE_SHIFT > cbits) {
- index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
- } else if (PAGE_CACHE_SHIFT < cbits) {
- index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
+ if (PAGE_SHIFT > cbits) {
+ index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits);
+ } else if (PAGE_SHIFT < cbits) {
+ index = (pgoff_t)clusters << (cbits - PAGE_SHIFT);
}
return index;
@@ -853,8 +853,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int pages_per_cluster = 1;
- if (PAGE_CACHE_SHIFT < cbits)
- pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+ if (PAGE_SHIFT < cbits)
+ pages_per_cluster = 1 << (cbits - PAGE_SHIFT);
return pages_per_cluster;
}
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 540ab5b75dbb0..44d178b8d1aa9 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -580,7 +580,7 @@ struct ocfs2_extended_slot {
/*00*/ __u8 es_valid;
__u8 es_reserved1[3];
__le32 es_node_num;
-/*10*/
+/*08*/
};
/*
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 3892f3c079ca8..87e577a49b0d5 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -483,7 +483,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_global_disk_dqblk dqblk;
s64 spacechange, inodechange;
- time_t olditime, oldbtime;
+ time64_t olditime, oldbtime;
err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct ocfs2_global_disk_dqblk),
@@ -867,6 +867,10 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
int status = 0;
trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type);
+ if (!sb_has_quota_loaded(sb, type)) {
+ status = -ESRCH;
+ goto out;
+ }
status = ocfs2_lock_global_qf(info, 0);
if (status < 0)
goto out;
@@ -878,8 +882,11 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
out_global:
ocfs2_unlock_global_qf(info, 0);
out:
- /* Avoid logging ENOENT since it just means there isn't next ID */
- if (status && status != -ENOENT)
+ /*
+ * Avoid logging ENOENT since it just means there isn't next ID and
+ * ESRCH which means quota isn't enabled for the filesystem.
+ */
+ if (status && status != -ENOENT && status != -ESRCH)
mlog_errno(status);
return status;
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3eff031aaf264..92bbe93bfe107 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2937,16 +2937,16 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
end = i_size_read(inode);
while (offset < end) {
- page_index = offset >> PAGE_CACHE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ page_index = offset >> PAGE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
if (map_end > end)
map_end = end;
/* from, to is the offset within the page. */
- from = offset & (PAGE_CACHE_SIZE - 1);
- to = PAGE_CACHE_SIZE;
- if (map_end & (PAGE_CACHE_SIZE - 1))
- to = map_end & (PAGE_CACHE_SIZE - 1);
+ from = offset & (PAGE_SIZE - 1);
+ to = PAGE_SIZE;
+ if (map_end & (PAGE_SIZE - 1))
+ to = map_end & (PAGE_SIZE - 1);
page = find_or_create_page(mapping, page_index, GFP_NOFS);
if (!page) {
@@ -2956,10 +2956,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
/*
- * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
+ * In case PAGE_SIZE <= CLUSTER_SIZE, This page
* can't be dirtied before we CoW it out.
*/
- if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
+ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
if (!PageUptodate(page)) {
@@ -2987,7 +2987,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
mark_page_accessed(page);
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
offset = map_end;
if (ret)
@@ -3165,8 +3165,8 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
}
while (offset < end) {
- page_index = offset >> PAGE_CACHE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ page_index = offset >> PAGE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
if (map_end > end)
map_end = end;
@@ -3182,7 +3182,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
mark_page_accessed(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
offset = map_end;
if (ret)
@@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
- struct posix_acl *default_acl, *acl;
- umode_t mode;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
- mode = inode->i_mode;
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
- if (error) {
- mlog_errno(error);
- return error;
- }
- error = ocfs2_create_inode_in_orphan(dir, mode,
+ error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
- &new_dentry->d_name,
- default_acl, acl);
+ &new_dentry->d_name);
if (error)
mlog_errno(error);
}
out:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1e09592148ad2..d7407994f3080 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
spin_unlock(&osb->osb_lock);
status = ocfs2_update_disk_slot(osb, si, slot_num);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto bail;
- }
-bail:
ocfs2_free_slot_info(osb);
}
-
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ced70c8139f73..c9e828ec3c8ee 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1007,10 +1007,17 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
lc->oc_type = NO_CONTROLD;
rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
- DLM_LSFL_FS, DLM_LVB_LEN,
+ DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
&ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
- if (rc)
+ if (rc) {
+ if (rc == -EEXIST || rc == -EPROTO)
+ printk(KERN_ERR "ocfs2: Unable to create the "
+ "lockspace %s (%d), because a ocfs2-tools "
+ "program is running on this file system "
+ "with the same name lockspace\n",
+ conn->cc_name, rc);
goto out;
+ }
if (ops_rv == -EOPNOTSUPP) {
lc->oc_type = WITH_CONTROLD;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 13219ed73e1d6..52c07346bea3f 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -735,8 +735,6 @@ static void __exit ocfs2_stack_glue_exit(void)
{
memset(&locking_max_version, 0,
sizeof(struct ocfs2_protocol_version));
- locking_max_version.pv_major = 0;
- locking_max_version.pv_minor = 0;
ocfs2_sysfs_exit();
if (ocfs2_table_header)
unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2f19aeec54821..ea47120a85ff2 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1164,7 +1164,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
int flags,
struct ocfs2_alloc_context **ac)
{
- int status;
+ int status, ret = 0;
+ int retried = 0;
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
@@ -1189,7 +1190,24 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
}
if (status == -ENOSPC) {
+retry:
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+ /* Retry if there is sufficient space cached in truncate log */
+ if (status == -ENOSPC && !retried) {
+ retried = 1;
+ ocfs2_inode_unlock((*ac)->ac_inode, 1);
+ inode_unlock((*ac)->ac_inode);
+
+ ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
+ if (ret == 1)
+ goto retry;
+
+ if (ret < 0)
+ mlog_errno(ret);
+
+ inode_lock((*ac)->ac_inode);
+ ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+ }
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7db631e1c8b0a..603b28d6f0082 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -605,8 +605,8 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
/*
* We might be limited by page cache size.
*/
- if (bytes > PAGE_CACHE_SIZE) {
- bytes = PAGE_CACHE_SIZE;
+ if (bytes > PAGE_SIZE) {
+ bytes = PAGE_SIZE;
trim = 1;
/*
* Shift by 31 here so that we don't get larger than
@@ -1819,7 +1819,7 @@ static int ocfs2_get_sector(struct super_block *sb,
if (!buffer_dirty(*bh))
clear_buffer_uptodate(*bh);
unlock_buffer(*bh);
- ll_rw_block(READ, 1, bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
mlog_errno(-EIO);
@@ -2072,7 +2072,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
osb->sb = sb;
- /* Save off for ocfs2_rw_direct */
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 7d3d979f57d91..5bb44f7a78ee8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7216,12 +7216,10 @@ out:
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl)
+ const struct qstr *qstr)
{
- struct buffer_head *dir_bh = NULL;
int ret = 0;
+ struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
mlog_errno(ret);
goto leave;
}
-
- if (!ret && default_acl)
- ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
- if (!ret && acl)
- ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
+ if (ret)
+ mlog_errno(ret);
ocfs2_inode_unlock(dir, 0);
brelse(dir_bh);
@@ -7250,18 +7246,19 @@ leave:
* 'security' attributes support
*/
static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY,
name, buffer, size);
}
static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -7321,18 +7318,19 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
* 'trusted' attributes support
*/
static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
@@ -7346,27 +7344,28 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
* 'user' attributes support
*/
static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
buffer, size);
}
static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index f10d5b93c366c..1633cc15ea1fd 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl);
+ const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index f833bf8d57929..c8cbf3b606456 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -452,6 +452,6 @@ const struct inode_operations omfs_dir_inops = {
const struct file_operations omfs_dir_operations = {
.read = generic_read_dir,
- .iterate = omfs_readdir,
+ .iterate_shared = omfs_readdir,
.llseek = generic_file_llseek,
};
diff --git a/fs/open.c b/fs/open.c
index bfe6f2b8345f5..4fd6e256f4f45 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -65,7 +65,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
return ret;
}
-long vfs_truncate(struct path *path, loff_t length)
+long vfs_truncate(const struct path *path, loff_t length)
{
struct inode *inode;
long error;
@@ -499,7 +499,7 @@ out:
return error;
}
-static int chmod_common(struct path *path, umode_t mode)
+static int chmod_common(const struct path *path, umode_t mode)
{
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
@@ -564,7 +564,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
return sys_fchmodat(AT_FDCWD, filename, mode);
}
-static int chown_common(struct path *path, uid_t user, gid_t group)
+static int chown_common(const struct path *path, uid_t user, gid_t group)
{
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
@@ -713,7 +713,7 @@ static int do_dentry_open(struct file *f,
}
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;
f->f_op = fops_get(inode->i_fop);
@@ -840,17 +840,13 @@ EXPORT_SYMBOL(file_path);
int vfs_open(const struct path *path, struct file *file,
const struct cred *cred)
{
- struct dentry *dentry = path->dentry;
- struct inode *inode = dentry->d_inode;
+ struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags);
- file->f_path = *path;
- if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
- inode = dentry->d_op->d_select_inode(dentry, file->f_flags);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- }
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
- return do_dentry_open(file, inode, NULL, cred);
+ file->f_path = *path;
+ return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
}
struct file *dentry_open(const struct path *path, int flags,
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index b61b883c8ff8e..c7a86993d97e3 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -166,7 +166,7 @@ static int openpromfs_readdir(struct file *, struct dir_context *);
static const struct file_operations openprom_operations = {
.read = generic_read_dir,
- .iterate = openpromfs_readdir,
+ .iterate_shared = openpromfs_readdir,
.llseek = generic_file_llseek,
};
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 03f89dbb2512a..28f2195cd7986 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -18,10 +18,10 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- key = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+ key = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- key = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+ key = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
gossip_err("orangefs_get_acl: bogus value of type %d\n", type);
@@ -43,11 +43,8 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
get_khandle_from_ino(inode),
key,
type);
- ret = orangefs_inode_getxattr(inode,
- "",
- key,
- value,
- ORANGEFS_MAX_XATTR_VALUELEN);
+ ret = orangefs_inode_getxattr(inode, key, value,
+ ORANGEFS_MAX_XATTR_VALUELEN);
/* if the key exists, convert it to an in-memory rep */
if (ret > 0) {
acl = posix_acl_from_xattr(&init_user_ns, value, ret);
@@ -74,7 +71,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
umode_t mode = inode->i_mode;
/*
@@ -98,7 +95,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
break;
case ACL_TYPE_DEFAULT:
- name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
gossip_err("%s: invalid type %d!\n", __func__, type);
@@ -131,7 +128,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
* will xlate to a removexattr. However, we don't want removexattr
* complain if attributes does not exist.
*/
- error = orangefs_inode_setxattr(inode, "", name, value, size, 0);
+ error = orangefs_inode_setxattr(inode, name, value, size, 0);
out:
kfree(value);
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 5dfc4f3cfe68a..00235bf644dcf 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -73,6 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
}
}
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
ret = 1;
out_release_op:
op_release(new_op);
@@ -94,6 +95,9 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int ret;
+ if (time_before(jiffies, dentry->d_time))
+ return 1;
+
if (flags & LOOKUP_RCU)
return -ECHILD;
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index db170beba7974..a287a66d94e35 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -116,6 +116,13 @@ static int orangefs_devreq_open(struct inode *inode, struct file *file)
{
int ret = -EINVAL;
+ /* in order to ensure that the filesystem driver sees correct UIDs */
+ if (file->f_cred->user_ns != &init_user_ns) {
+ gossip_err("%s: device cannot be opened outside init_user_ns\n",
+ __func__);
+ goto out;
+ }
+
if (!(file->f_flags & O_NONBLOCK)) {
gossip_err("%s: device cannot be opened in blocking mode\n",
__func__);
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index f30b6ecacdd19..324f0af40d7bd 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -153,7 +153,6 @@ static int orangefs_readdir(struct file *file, struct dir_context *ctx)
struct dentry *dentry = file->f_path.dentry;
struct orangefs_kernel_op_s *new_op = NULL;
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
- int buffer_full = 0;
struct orangefs_readdir_response_s readdir_response;
void *dents_buf;
int i = 0;
@@ -235,7 +234,7 @@ get_new_buffer_index:
if (ret == -EIO && op_state_purged(new_op)) {
gossip_err("%s: Client is down. Aborting readdir call.\n",
__func__);
- goto out_slot;
+ goto out_free_op;
}
if (ret < 0 || new_op->downcall.status != 0) {
@@ -244,14 +243,14 @@ get_new_buffer_index:
new_op->downcall.status);
if (ret >= 0)
ret = new_op->downcall.status;
- goto out_slot;
+ goto out_free_op;
}
dents_buf = new_op->downcall.trailer_buf;
if (dents_buf == NULL) {
gossip_err("Invalid NULL buffer in readdir response\n");
ret = -ENOMEM;
- goto out_slot;
+ goto out_free_op;
}
bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
@@ -350,8 +349,7 @@ get_new_buffer_index:
/*
* Did we hit the end of the directory?
*/
- if (readdir_response.token == ORANGEFS_READDIR_END &&
- !buffer_full) {
+ if (readdir_response.token == ORANGEFS_READDIR_END) {
gossip_debug(GOSSIP_DIR_DEBUG,
"End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
ctx->pos = ORANGEFS_READDIR_END;
@@ -363,8 +361,6 @@ out_destroy_handle:
out_vfree:
gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
vfree(dents_buf);
-out_slot:
- orangefs_readdir_index_put(buffer_index);
out_free_op:
op_release(new_op);
gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index ae92795ed9658..526040e09f787 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -445,7 +445,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
- mutex_lock(&file->f_mapping->host->i_mutex);
+ inode_lock(file->f_mapping->host);
/* Make sure generic_write_checks sees an up to date inode size. */
if (file->f_flags & O_APPEND) {
@@ -492,7 +492,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
out:
- mutex_unlock(&file->f_mapping->host->i_mutex);
+ inode_unlock(file->f_mapping->host);
return rc;
}
@@ -516,7 +516,6 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar
if (cmd == FS_IOC_GETFLAGS) {
val = 0;
ret = orangefs_inode_getxattr(file_inode(file),
- ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
"user.pvfs2.meta_hint",
&val, sizeof(val));
if (ret < 0 && ret != -ENODATA)
@@ -549,7 +548,6 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar
"orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
(unsigned long long)val);
ret = orangefs_inode_setxattr(file_inode(file),
- ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
"user.pvfs2.meta_hint",
&val, sizeof(val), 0);
}
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 2382e267b49e3..28a0557a69be8 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -18,8 +18,8 @@ static int read_one_page(struct page *page)
int max_block;
ssize_t bytes_read = 0;
struct inode *inode = page->mapping->host;
- const __u32 blocksize = PAGE_CACHE_SIZE; /* inode->i_blksize */
- const __u32 blockbits = PAGE_CACHE_SHIFT; /* inode->i_blkbits */
+ const __u32 blocksize = PAGE_SIZE; /* inode->i_blksize */
+ const __u32 blockbits = PAGE_SHIFT; /* inode->i_blkbits */
struct iov_iter to;
struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
@@ -80,13 +80,13 @@ static int orangefs_readpages(struct file *file,
if (!add_to_page_cache(page,
mapping,
page->index,
- GFP_KERNEL)) {
+ readahead_gfp_mask(mapping))) {
ret = read_one_page(page);
gossip_debug(GOSSIP_INODE_DEBUG,
"failure adding page to cache, read_one_page returned: %d\n",
ret);
} else {
- page_cache_release(page);
+ put_page(page);
}
}
BUG_ON(!list_empty(pages));
@@ -124,19 +124,16 @@ static int orangefs_releasepage(struct page *page, gfp_t foo)
* will need to be able to use O_DIRECT on open in order to support
* AIO. Modeled after NFS, they do this too.
*/
-/*
- * static ssize_t orangefs_direct_IO(int rw,
- * struct kiocb *iocb,
- * struct iov_iter *iter,
- * loff_t offset)
- *{
- * gossip_debug(GOSSIP_INODE_DEBUG,
- * "orangefs_direct_IO: %s\n",
- * iocb->ki_filp->f_path.dentry->d_name.name);
- *
- * return -EINVAL;
- *}
- */
+
+static ssize_t orangefs_direct_IO(struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_direct_IO: %s\n",
+ iocb->ki_filp->f_path.dentry->d_name.name);
+
+ return -EINVAL;
+}
struct backing_dev_info orangefs_backing_dev_info = {
.name = "orangefs",
@@ -150,7 +147,7 @@ const struct address_space_operations orangefs_address_operations = {
.readpages = orangefs_readpages,
.invalidatepage = orangefs_invalidatepage,
.releasepage = orangefs_releasepage,
-/* .direct_IO = orangefs_direct_IO */
+ .direct_IO = orangefs_direct_IO,
};
static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
@@ -204,22 +201,8 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
if (ret != 0)
return ret;
- /*
- * Only change the c/mtime if we are changing the size or we are
- * explicitly asked to change it. This handles the semantic difference
- * between truncate() and ftruncate() as implemented in the VFS.
- *
- * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
- * special case where we need to update the times despite not having
- * these flags set. For all other operations the VFS set these flags
- * explicitly if it wants a timestamp update.
- */
- if (orig_size != i_size_read(inode) &&
- !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
- iattr->ia_ctime = iattr->ia_mtime =
- current_fs_time(inode->i_sb);
+ if (orig_size != i_size_read(inode))
iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
- }
return ret;
}
@@ -279,7 +262,7 @@ int orangefs_getattr(struct vfsmount *mnt,
"orangefs_getattr: called on %s\n",
dentry->d_name.name);
- ret = orangefs_inode_getattr(inode, 0, 1);
+ ret = orangefs_inode_getattr(inode, 0, 0);
if (ret == 0) {
generic_fillattr(inode, kstat);
@@ -308,7 +291,7 @@ int orangefs_permission(struct inode *inode, int mask)
}
/* ORANGEDS2 implementation of VFS inode operations for files */
-struct inode_operations orangefs_file_inode_operations = {
+const struct inode_operations orangefs_file_inode_operations = {
.get_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.setattr = orangefs_setattr,
@@ -328,7 +311,7 @@ static int orangefs_init_iops(struct inode *inode)
case S_IFREG:
inode->i_op = &orangefs_file_inode_operations;
inode->i_fop = &orangefs_file_operations;
- inode->i_blkbits = PAGE_CACHE_SHIFT;
+ inode->i_blkbits = PAGE_SHIFT;
break;
case S_IFLNK:
inode->i_op = &orangefs_symlink_inode_operations;
@@ -401,7 +384,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
if (!inode || !(inode->i_state & I_NEW))
return inode;
- error = orangefs_inode_getattr(inode, 1, 0);
+ error = orangefs_inode_getattr(inode, 1, 1);
if (error) {
iget_failed(inode);
return ERR_PTR(error);
@@ -446,7 +429,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
- error = orangefs_inode_getattr(inode, 1, 0);
+ error = orangefs_inode_getattr(inode, 1, 1);
if (error)
goto out_iput;
@@ -456,7 +439,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_size = PAGE_CACHE_SIZE;
+ inode->i_size = PAGE_SIZE;
inode->i_rdev = dev;
error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 5a60c508af4ed..62c525936ee88 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -72,6 +72,8 @@ static int orangefs_create(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: dentry instantiated for %s\n",
@@ -181,6 +183,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+
inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
if (IS_ERR(inode)) {
gossip_debug(GOSSIP_NAME_DEBUG,
@@ -189,6 +193,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+
gossip_debug(GOSSIP_NAME_DEBUG,
"%s:%s:%d "
"Found good inode [%lu] with count [%d]\n",
@@ -316,6 +322,8 @@ static int orangefs_symlink(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Symlink) %pU -> %s\n",
@@ -378,6 +386,8 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Directory) %pU -> %s\n",
@@ -405,12 +415,10 @@ static int orangefs_rename(struct inode *old_dir,
int ret;
gossip_debug(GOSSIP_NAME_DEBUG,
- "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n",
- old_dentry->d_parent->d_name.name,
- old_dentry->d_name.name,
- new_dentry->d_parent->d_name.name,
- new_dentry->d_name.name,
- d_count(new_dentry));
+ "orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
+ old_dentry, new_dentry, d_count(new_dentry));
+
+ ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1;
new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
if (!new_op)
@@ -442,7 +450,7 @@ static int orangefs_rename(struct inode *old_dir,
}
/* ORANGEFS implementation of VFS inode operations for directories */
-struct inode_operations orangefs_dir_inode_operations = {
+const struct inode_operations orangefs_dir_inode_operations = {
.lookup = orangefs_lookup,
.get_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 1f8acc9f9a888..75375e90a63f3 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -170,7 +170,7 @@ orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
int i;
for (i = 0; i < bufmap->page_count; i++)
- page_cache_release(bufmap->page_array[i]);
+ put_page(bufmap->page_array[i]);
}
static void
@@ -299,7 +299,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
for (i = 0; i < ret; i++) {
SetPageError(bufmap->page_array[i]);
- page_cache_release(bufmap->page_array[i]);
+ put_page(bufmap->page_array[i]);
}
return -ENOMEM;
}
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 900a2e38e11bc..b6edbe9fb309e 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -136,10 +136,10 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
llu(new_op->tag),
get_opname_string(new_op));
- new_op->upcall.uid = from_kuid(current_user_ns(),
+ new_op->upcall.uid = from_kuid(&init_user_ns,
current_fsuid());
- new_op->upcall.gid = from_kgid(current_user_ns(),
+ new_op->upcall.gid = from_kgid(&init_user_ns,
current_fsgid());
} else {
gossip_err("op_alloc: kmem_cache_zalloc failed!\n");
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 19670b8b4053b..1714a737d5563 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -126,8 +126,7 @@ out:
void orangefs_debugfs_cleanup(void)
{
- if (debug_dir)
- debugfs_remove_recursive(debug_dir);
+ debugfs_remove_recursive(debug_dir);
}
/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index a9925e296ceb4..633c07a6e3d80 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -119,17 +119,6 @@ struct client_debug_mask {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
-/* orangefs xattr and acl related defines */
-#define ORANGEFS_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define ORANGEFS_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define ORANGEFS_XATTR_INDEX_TRUSTED 3
-#define ORANGEFS_XATTR_INDEX_DEFAULT 4
-
-#define ORANGEFS_XATTR_NAME_ACL_ACCESS XATTR_NAME_POSIX_ACL_ACCESS
-#define ORANGEFS_XATTR_NAME_ACL_DEFAULT XATTR_NAME_POSIX_ACL_DEFAULT
-#define ORANGEFS_XATTR_NAME_TRUSTED_PREFIX "trusted."
-#define ORANGEFS_XATTR_NAME_DEFAULT_PREFIX ""
-
/* these functions are defined in orangefs-utils.c */
int orangefs_prepare_cdm_array(char *debug_array_string);
int orangefs_prepare_debugfs_help_string(int);
@@ -257,6 +246,8 @@ struct orangefs_inode_s {
* with this object
*/
unsigned long pinode_flags;
+
+ unsigned long getattr_time;
};
#define P_ATIME_FLAG 0
@@ -528,19 +519,17 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op);
int orangefs_flush_inode(struct inode *inode);
ssize_t orangefs_inode_getxattr(struct inode *inode,
- const char *prefix,
const char *name,
void *buffer,
size_t size);
int orangefs_inode_setxattr(struct inode *inode,
- const char *prefix,
const char *name,
const void *value,
size_t size,
int flags);
-int orangefs_inode_getattr(struct inode *inode, int new, int size);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
int orangefs_inode_check_changed(struct inode *inode);
@@ -559,6 +548,8 @@ extern struct mutex request_mutex;
extern int debug;
extern int op_timeout_secs;
extern int slot_timeout_secs;
+extern int dcache_timeout_msecs;
+extern int getattr_timeout_msecs;
extern struct list_head orangefs_superblocks;
extern spinlock_t orangefs_superblocks_lock;
extern struct list_head orangefs_request_list;
@@ -570,10 +561,10 @@ extern int hash_table_size;
extern const struct address_space_operations orangefs_address_operations;
extern struct backing_dev_info orangefs_backing_dev_info;
-extern struct inode_operations orangefs_file_inode_operations;
+extern const struct inode_operations orangefs_file_inode_operations;
extern const struct file_operations orangefs_file_operations;
-extern struct inode_operations orangefs_symlink_inode_operations;
-extern struct inode_operations orangefs_dir_inode_operations;
+extern const struct inode_operations orangefs_symlink_inode_operations;
+extern const struct inode_operations orangefs_dir_inode_operations;
extern const struct file_operations orangefs_dir_operations;
extern const struct dentry_operations orangefs_dentry_operations;
extern const struct file_operations orangefs_devreq_file_operations;
@@ -600,8 +591,8 @@ int service_operation(struct orangefs_kernel_op_s *op,
#define fill_default_sys_attrs(sys_attr, type, mode) \
do { \
- sys_attr.owner = from_kuid(current_user_ns(), current_fsuid()); \
- sys_attr.group = from_kgid(current_user_ns(), current_fsgid()); \
+ sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \
+ sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \
sys_attr.perms = ORANGEFS_util_translate_mode(mode); \
sys_attr.mtime = 0; \
sys_attr.atime = 0; \
@@ -612,11 +603,11 @@ do { \
static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
#endif
i_size_write(inode, i_size);
#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
#endif
}
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 6f072a8c0de17..e9fd5755c05fa 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -47,6 +47,8 @@ struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
unsigned int kernel_mask_set_mod_init; /* implicitly false */
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+int dcache_timeout_msecs = 50;
+int getattr_timeout_msecs = 50;
MODULE_LICENSE("GPL");
MODULE_AUTHOR("ORANGEFS Development Team");
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 5c03113e3ad2a..375708c2db87c 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -61,10 +61,21 @@
* Slots are requested and waited for,
* the wait times out after slot_timeout_secs.
*
+ * What: /sys/fs/orangefs/dcache_timeout_msecs
+ * Date: Jul 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Time lookup is valid in milliseconds.
+ *
+ * What: /sys/fs/orangefs/getattr_timeout_msecs
+ * Date: Jul 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Time getattr is valid in milliseconds.
*
* What: /sys/fs/orangefs/acache/...
* Date: Jun 2015
- * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Contact: Martin Brandenburg <martin@omnibond.com>
* Description:
* Attribute cache configurable settings.
*
@@ -117,6 +128,8 @@ struct orangefs_obj {
int perf_history_size;
int perf_time_interval_secs;
int slot_timeout_secs;
+ int dcache_timeout_msecs;
+ int getattr_timeout_msecs;
};
struct acache_orangefs_obj {
@@ -658,6 +671,20 @@ static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
"%d\n",
slot_timeout_secs);
goto out;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "dcache_timeout_msecs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ dcache_timeout_msecs);
+ goto out;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "getattr_timeout_msecs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ getattr_timeout_msecs);
+ goto out;
} else {
goto out;
}
@@ -734,6 +761,12 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
} else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
rc = kstrtoint(buf, 0, &slot_timeout_secs);
goto out;
+ } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
+ rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
+ goto out;
+ } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
+ rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
+ goto out;
} else {
goto out;
}
@@ -1361,6 +1394,12 @@ static struct orangefs_attribute op_timeout_secs_attribute =
static struct orangefs_attribute slot_timeout_secs_attribute =
__ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+static struct orangefs_attribute dcache_timeout_msecs_attribute =
+ __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute getattr_timeout_msecs_attribute =
+ __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+
static struct orangefs_attribute perf_counter_reset_attribute =
__ATTR(perf_counter_reset,
0664,
@@ -1382,6 +1421,8 @@ static struct orangefs_attribute perf_time_interval_secs_attribute =
static struct attribute *orangefs_default_attrs[] = {
&op_timeout_secs_attribute.attr,
&slot_timeout_secs_attribute.attr,
+ &dcache_timeout_msecs_attribute.attr,
+ &getattr_timeout_msecs_attribute.attr,
&perf_counter_reset_attribute.attr,
&perf_history_size_attribute.attr,
&perf_time_interval_secs_attribute.attr,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 40f5163b56aa0..d13c7291fd054 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -153,12 +153,12 @@ static inline int copy_attributes_from_inode(struct inode *inode,
*/
attrs->mask = 0;
if (iattr->ia_valid & ATTR_UID) {
- attrs->owner = from_kuid(current_user_ns(), iattr->ia_uid);
+ attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid);
attrs->mask |= ORANGEFS_ATTR_SYS_UID;
gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
}
if (iattr->ia_valid & ATTR_GID) {
- attrs->group = from_kgid(current_user_ns(), iattr->ia_gid);
+ attrs->group = from_kgid(&init_user_ns, iattr->ia_gid);
attrs->mask |= ORANGEFS_ATTR_SYS_GID;
gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
}
@@ -251,7 +251,7 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
return 0;
}
-int orangefs_inode_getattr(struct inode *inode, int new, int size)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -261,12 +261,16 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
get_khandle_from_ino(inode));
+ if (!new && !bypass) {
+ if (time_before(jiffies, orangefs_inode->getattr_time))
+ return 0;
+ }
+
new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
if (!new_op)
return -ENOMEM;
new_op->upcall.req.getattr.refn = orangefs_inode->refn;
- new_op->upcall.req.getattr.mask = size ?
- ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
+ new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
ret = service_operation(new_op, __func__,
get_interruptible_flag(inode));
@@ -287,23 +291,21 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
case S_IFREG:
inode->i_flags = orangefs_inode_flags(&new_op->
downcall.resp.getattr.attributes);
- if (size) {
- inode_size = (loff_t)new_op->
- downcall.resp.getattr.attributes.size;
- rounded_up_size =
- (inode_size + (4096 - (inode_size % 4096)));
- inode->i_size = inode_size;
- orangefs_inode->blksize =
- new_op->downcall.resp.getattr.attributes.blksize;
- spin_lock(&inode->i_lock);
- inode->i_bytes = inode_size;
- inode->i_blocks =
- (unsigned long)(rounded_up_size / 512);
- spin_unlock(&inode->i_lock);
- }
+ inode_size = (loff_t)new_op->
+ downcall.resp.getattr.attributes.size;
+ rounded_up_size =
+ (inode_size + (4096 - (inode_size % 4096)));
+ inode->i_size = inode_size;
+ orangefs_inode->blksize =
+ new_op->downcall.resp.getattr.attributes.blksize;
+ spin_lock(&inode->i_lock);
+ inode->i_bytes = inode_size;
+ inode->i_blocks =
+ (unsigned long)(rounded_up_size / 512);
+ spin_unlock(&inode->i_lock);
break;
case S_IFDIR:
- inode->i_size = PAGE_CACHE_SIZE;
+ inode->i_size = PAGE_SIZE;
orangefs_inode->blksize = (1 << inode->i_blkbits);
spin_lock(&inode->i_lock);
inode_set_bytes(inode, inode->i_size);
@@ -315,9 +317,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
inode->i_size = (loff_t)strlen(new_op->
downcall.resp.getattr.link_target);
orangefs_inode->blksize = (1 << inode->i_blkbits);
- strlcpy(orangefs_inode->link_target,
+ ret = strscpy(orangefs_inode->link_target,
new_op->downcall.resp.getattr.link_target,
ORANGEFS_NAME_MAX);
+ if (ret == -E2BIG) {
+ ret = -EIO;
+ goto out;
+ }
inode->i_link = orangefs_inode->link_target;
}
break;
@@ -341,6 +347,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
+ orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
ret = 0;
out:
op_release(new_op);
@@ -414,6 +421,7 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
ClearMtimeFlag(orangefs_inode);
ClearCtimeFlag(orangefs_inode);
ClearModeFlag(orangefs_inode);
+ orangefs_inode->getattr_time = jiffies - 1;
}
return ret;
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 45ce4ff4cbc79..3d7418c728f57 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -1,3 +1,4 @@
+#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/spinlock_types.h>
#include <linux/slab.h>
@@ -74,8 +75,8 @@ static inline void ORANGEFS_khandle_to(const struct orangefs_khandle *kh,
void *p, int size)
{
- memset(p, 0, size);
memcpy(p, kh->u, 16);
+ memset(p + 16, 0, size - 16);
}
@@ -206,14 +207,6 @@ typedef __s64 ORANGEFS_offset;
ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
ORANGEFS_ATTR_SYS_BLKSIZE)
-#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
- (ORANGEFS_ATTR_SYS_COMMON_ALL | \
- ORANGEFS_ATTR_SYS_LNK_TARGET | \
- ORANGEFS_ATTR_SYS_DFILE_COUNT | \
- ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
- ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
- ORANGEFS_ATTR_SYS_BLKSIZE)
-
#define ORANGEFS_XATTR_REPLACE 0x2
#define ORANGEFS_XATTR_CREATE 0x1
#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
@@ -407,7 +400,7 @@ enum {
* space. Zero signifies the upstream version of the kernel module.
*/
#define ORANGEFS_KERNEL_PROTO_VERSION 0
-#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20904
+#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20903
/*
* describes memory regions to map in the ORANGEFS_DEV_MAP ioctl.
@@ -427,26 +420,28 @@ struct ORANGEFS_dev_map_desc {
/* gossip.h *****************************************************************/
#ifdef GOSSIP_DISABLE_DEBUG
-#define gossip_debug(mask, format, f...) do {} while (0)
+#define gossip_debug(mask, fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
#else
extern __u64 gossip_debug_mask;
extern struct client_debug_mask client_debug_mask;
/* try to avoid function call overhead by checking masks in macro */
-#define gossip_debug(mask, format, f...) \
-do { \
- if (gossip_debug_mask & mask) \
- printk(format, ##f); \
+#define gossip_debug(mask, fmt, ...) \
+do { \
+ if (gossip_debug_mask & (mask)) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
} while (0)
#endif /* GOSSIP_DISABLE_DEBUG */
/* do file and line number printouts w/ the GNU preprocessor */
-#define gossip_ldebug(mask, format, f...) \
- gossip_debug(mask, "%s: " format, __func__, ##f)
-
-#define gossip_err printk
-#define gossip_lerr(format, f...) \
- gossip_err("%s line %d: " format, \
- __FILE__, \
- __LINE__, \
- ##f)
+#define gossip_ldebug(mask, fmt, ...) \
+ gossip_debug(mask, "%s: " fmt, __func__, ##__VA_ARGS__)
+
+#define gossip_err pr_err
+#define gossip_lerr(fmt, ...) \
+ gossip_err("%s line %d: " fmt, \
+ __FILE__, __LINE__, ##__VA_ARGS__)
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
index 6418dd6386801..8fecf823f5ba2 100644
--- a/fs/orangefs/symlink.c
+++ b/fs/orangefs/symlink.c
@@ -8,7 +8,7 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-struct inode_operations orangefs_symlink_inode_operations = {
+const struct inode_operations orangefs_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = orangefs_setattr,
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index ef5da7538cd51..2a9f07f06d100 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -59,8 +59,8 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags)
* unless the key does not exist for the file and/or if
* there were errors in fetching the attribute value.
*/
-ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
- const char *name, void *buffer, size_t size)
+ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op = NULL;
@@ -70,21 +70,17 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
int fsgid;
gossip_debug(GOSSIP_XATTR_DEBUG,
- "%s: prefix %s name %s, buffer_size %zd\n",
- __func__, prefix, name, size);
+ "%s: name %s, buffer_size %zd\n",
+ __func__, name, size);
- if (name == NULL || (size > 0 && buffer == NULL)) {
- gossip_err("orangefs_inode_getxattr: bogus NULL pointers\n");
- return -EINVAL;
- }
- if ((strlen(name) + strlen(prefix)) >= ORANGEFS_MAX_XATTR_NAMELEN) {
+ if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
gossip_err("Invalid key length (%d)\n",
- (int)(strlen(name) + strlen(prefix)));
+ (int)strlen(name));
return -EINVAL;
}
- fsuid = from_kuid(current_user_ns(), current_fsuid());
- fsgid = from_kgid(current_user_ns(), current_fsgid());
+ fsuid = from_kuid(&init_user_ns, current_fsuid());
+ fsgid = from_kgid(&init_user_ns, current_fsgid());
gossip_debug(GOSSIP_XATTR_DEBUG,
"getxattr on inode %pU, name %s "
@@ -101,15 +97,14 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
goto out_unlock;
new_op->upcall.req.getxattr.refn = orangefs_inode->refn;
- ret = snprintf((char *)new_op->upcall.req.getxattr.key,
- ORANGEFS_MAX_XATTR_NAMELEN, "%s%s", prefix, name);
+ strcpy(new_op->upcall.req.getxattr.key, name);
/*
* NOTE: Although keys are meant to be NULL terminated textual
* strings, I am going to explicitly pass the length just in case
* we change this later on...
*/
- new_op->upcall.req.getxattr.key_sz = ret + 1;
+ new_op->upcall.req.getxattr.key_sz = strlen(name) + 1;
ret = service_operation(new_op, "orangefs_inode_getxattr",
get_interruptible_flag(inode));
@@ -146,8 +141,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
goto out_release_op;
}
- memset(buffer, 0, size);
memcpy(buffer, new_op->downcall.resp.getxattr.val, length);
+ memset(buffer + length, 0, size - length);
gossip_debug(GOSSIP_XATTR_DEBUG,
"orangefs_inode_getxattr: inode %pU "
"key %s key_sz %d, val_len %d\n",
@@ -167,10 +162,8 @@ out_unlock:
return ret;
}
-static int orangefs_inode_removexattr(struct inode *inode,
- const char *prefix,
- const char *name,
- int flags)
+static int orangefs_inode_removexattr(struct inode *inode, const char *name,
+ int flags)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op = NULL;
@@ -187,12 +180,8 @@ static int orangefs_inode_removexattr(struct inode *inode,
* textual strings, I am going to explicitly pass the
* length just in case we change this later on...
*/
- ret = snprintf((char *)new_op->upcall.req.removexattr.key,
- ORANGEFS_MAX_XATTR_NAMELEN,
- "%s%s",
- (prefix ? prefix : ""),
- name);
- new_op->upcall.req.removexattr.key_sz = ret + 1;
+ strcpy(new_op->upcall.req.removexattr.key, name);
+ new_op->upcall.req.removexattr.key_sz = strlen(name) + 1;
gossip_debug(GOSSIP_XATTR_DEBUG,
"orangefs_inode_removexattr: key %s, key_sz %d\n",
@@ -227,8 +216,8 @@ out_unlock:
* Returns a -ve number on error and 0 on success. Key is text, but value
* can be binary!
*/
-int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
- const char *name, const void *value, size_t size, int flags)
+int orangefs_inode_setxattr(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -236,11 +225,10 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
int ret = -ENOMEM;
gossip_debug(GOSSIP_XATTR_DEBUG,
- "%s: prefix %s, name %s, buffer_size %zd\n",
- __func__, prefix, name, size);
+ "%s: name %s, buffer_size %zd\n",
+ __func__, name, size);
- if (size < 0 ||
- size >= ORANGEFS_MAX_XATTR_VALUELEN ||
+ if (size >= ORANGEFS_MAX_XATTR_VALUELEN ||
flags < 0) {
gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
(int)size,
@@ -248,37 +236,21 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
return -EINVAL;
}
- if (name == NULL ||
- (size > 0 && value == NULL)) {
- gossip_err("orangefs_inode_setxattr: bogus NULL pointers!\n");
- return -EINVAL;
- }
-
internal_flag = convert_to_internal_xattr_flags(flags);
- if (prefix) {
- if (strlen(name) + strlen(prefix) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err
- ("orangefs_inode_setxattr: bogus key size (%d)\n",
- (int)(strlen(name) + strlen(prefix)));
- return -EINVAL;
- }
- } else {
- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err
- ("orangefs_inode_setxattr: bogus key size (%d)\n",
- (int)(strlen(name)));
- return -EINVAL;
- }
+ if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
+ gossip_err
+ ("orangefs_inode_setxattr: bogus key size (%d)\n",
+ (int)(strlen(name)));
+ return -EINVAL;
}
/* This is equivalent to a removexattr */
if (size == 0 && value == NULL) {
gossip_debug(GOSSIP_XATTR_DEBUG,
- "removing xattr (%s%s)\n",
- prefix,
+ "removing xattr (%s)\n",
name);
- return orangefs_inode_removexattr(inode, prefix, name, flags);
+ return orangefs_inode_removexattr(inode, name, flags);
}
gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -299,11 +271,8 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
* strings, I am going to explicitly pass the length just in
* case we change this later on...
*/
- ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key,
- ORANGEFS_MAX_XATTR_NAMELEN,
- "%s%s",
- prefix, name);
- new_op->upcall.req.setxattr.keyval.key_sz = ret + 1;
+ strcpy(new_op->upcall.req.setxattr.keyval.key, name);
+ new_op->upcall.req.setxattr.keyval.key_sz = strlen(name) + 1;
memcpy(new_op->upcall.req.setxattr.keyval.val, value, size);
new_op->upcall.req.setxattr.keyval.val_sz = size;
@@ -353,10 +322,6 @@ ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size)
gossip_err("%s: bogus NULL pointers\n", __func__);
return -EINVAL;
}
- if (size < 0) {
- gossip_err("Invalid size (%d)\n", (int)size);
- return -EINVAL;
- }
down_read(&orangefs_inode->xattr_sem);
new_op = op_alloc(ORANGEFS_VFS_OP_LISTXATTR);
@@ -463,75 +428,29 @@ out_unlock:
}
static int orangefs_xattr_set_default(const struct xattr_handler *handler,
- struct dentry *dentry,
+ struct dentry *unused,
+ struct inode *inode,
const char *name,
const void *buffer,
size_t size,
int flags)
{
- return orangefs_inode_setxattr(dentry->d_inode,
- ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
- name,
- buffer,
- size,
- flags);
+ return orangefs_inode_setxattr(inode, name, buffer, size, flags);
}
static int orangefs_xattr_get_default(const struct xattr_handler *handler,
- struct dentry *dentry,
+ struct dentry *unused,
+ struct inode *inode,
const char *name,
void *buffer,
size_t size)
{
- return orangefs_inode_getxattr(dentry->d_inode,
- ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
- name,
- buffer,
- size);
-
-}
-
-static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
- struct dentry *dentry,
- const char *name,
- const void *buffer,
- size_t size,
- int flags)
-{
- return orangefs_inode_setxattr(dentry->d_inode,
- ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
- name,
- buffer,
- size,
- flags);
-}
+ return orangefs_inode_getxattr(inode, name, buffer, size);
-static int orangefs_xattr_get_trusted(const struct xattr_handler *handler,
- struct dentry *dentry,
- const char *name,
- void *buffer,
- size_t size)
-{
- return orangefs_inode_getxattr(dentry->d_inode,
- ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
- name,
- buffer,
- size);
}
-static struct xattr_handler orangefs_xattr_trusted_handler = {
- .prefix = ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
- .get = orangefs_xattr_get_trusted,
- .set = orangefs_xattr_set_trusted,
-};
-
static struct xattr_handler orangefs_xattr_default_handler = {
- /*
- * NOTE: this is set to be the empty string.
- * so that all un-prefixed xattrs keys get caught
- * here!
- */
- .prefix = ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ .prefix = "", /* match any name => handlers called with full name */
.get = orangefs_xattr_get_default,
.set = orangefs_xattr_set_default,
};
@@ -539,7 +458,6 @@ static struct xattr_handler orangefs_xattr_default_handler = {
const struct xattr_handler *orangefs_xattr_handlers[] = {
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
- &orangefs_xattr_trusted_handler,
&orangefs_xattr_default_handler,
NULL
};
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index cc514da6f3e7b..54e5d66817867 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -292,6 +292,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
+ ovl_inode_update(d_inode(dentry), d_inode(newdentry));
newdentry = NULL;
/*
@@ -336,7 +337,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
- struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
@@ -357,28 +357,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
return PTR_ERR(link);
}
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_free_link;
-
- override_cred->fsuid = stat->uid;
- override_cred->fsgid = stat->gid;
- /*
- * CAP_SYS_ADMIN for copying up extended attributes
- * CAP_DAC_OVERRIDE for create
- * CAP_FOWNER for chmod, timestamp update
- * CAP_FSETID for chmod
- * CAP_CHOWN for chown
- * CAP_MKNOD for mknod
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- cap_raise(override_cred->cap_effective, CAP_FSETID);
- cap_raise(override_cred->cap_effective, CAP_CHOWN);
- cap_raise(override_cred->cap_effective, CAP_MKNOD);
- old_cred = override_creds(override_cred);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
@@ -401,9 +380,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
out_unlock:
unlock_rename(workdir, upperdir);
revert_creds(old_cred);
- put_cred(override_cred);
-out_free_link:
if (link)
free_page((unsigned long) link);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index b3fc0a35bf624..12bcd07b9e32c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,9 +138,12 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
int err;
enum ovl_path_type type;
struct path realpath;
+ const struct cred *old_cred;
type = ovl_path_real(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getattr(&realpath, stat);
+ revert_creds(old_cred);
if (err)
return err;
@@ -158,6 +161,22 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+/* Common operations required to be done after creation of file on upper */
+static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
+ struct dentry *newdentry, bool hardlink)
+{
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ if (!hardlink) {
+ ovl_inode_update(inode, d_inode(newdentry));
+ ovl_copyattr(newdentry->d_inode, inode);
+ } else {
+ WARN_ON(ovl_inode_real(inode, NULL) != d_inode(newdentry));
+ inc_nlink(inode);
+ }
+ d_instantiate(dentry, inode);
+}
+
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
@@ -177,10 +196,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
if (err)
goto out_dput;
- ovl_dentry_version_inc(dentry->d_parent);
- ovl_dentry_update(dentry, newdentry);
- ovl_copyattr(newdentry->d_inode, inode);
- d_instantiate(dentry, inode);
+ ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput:
dput(newdentry);
@@ -291,23 +307,29 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
+ enum ovl_path_type type = ovl_path_type(dentry);
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
- if (err)
+ if (err) {
ret = ERR_PTR(err);
- else {
- /*
- * If no upperdentry then skip clearing whiteouts.
- *
- * Can race with copy-up, since we don't hold the upperdir
- * mutex. Doesn't matter, since copy-up can't create a
- * non-empty directory from an empty one.
- */
- if (ovl_dentry_upper(dentry))
- ret = ovl_clear_empty(dentry, &list);
+ goto out_free;
}
+ /*
+ * When removing an empty opaque directory, then it makes no sense to
+ * replace it with an exact replica of itself.
+ *
+ * If no upperdentry then skip clearing whiteouts.
+ *
+ * Can race with copy-up, since we don't hold the upperdir mutex.
+ * Doesn't matter, since copy-up can't create a non-empty directory
+ * from an empty one.
+ */
+ if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type))
+ ret = ovl_clear_empty(dentry, &list);
+
+out_free:
ovl_cache_free(&list);
return ret;
@@ -347,7 +369,23 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_dput2;
- if (S_ISDIR(stat->mode)) {
+ /*
+ * mode could have been mutilated due to umask (e.g. sgid directory)
+ */
+ if (!hardlink &&
+ !S_ISLNK(stat->mode) && newdentry->d_inode->i_mode != stat->mode) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = stat->mode,
+ };
+ inode_lock(newdentry->d_inode);
+ err = notify_change(newdentry, &attr, NULL);
+ inode_unlock(newdentry->d_inode);
+ if (err)
+ goto out_cleanup;
+ }
+
+ if (!hardlink && S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
@@ -363,10 +401,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
- ovl_dentry_version_inc(dentry->d_parent);
- ovl_dentry_update(dentry, newdentry);
- ovl_copyattr(newdentry->d_inode, inode);
- d_instantiate(dentry, inode);
+ ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput2:
dput(upper);
@@ -382,58 +417,42 @@ out_cleanup:
goto out_dput2;
}
-static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
- const char *link, struct dentry *hardlink)
+static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
{
int err;
- struct inode *inode;
- struct kstat stat = {
- .mode = mode,
- .rdev = rdev,
- };
-
- err = -ENOMEM;
- inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
- if (!inode)
- goto out;
+ const struct cred *old_cred;
+ struct cred *override_cred;
err = ovl_copy_up(dentry->d_parent);
if (err)
- goto out_iput;
-
- if (!ovl_dentry_is_opaque(dentry)) {
- err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
- } else {
- const struct cred *old_cred;
- struct cred *override_cred;
-
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_iput;
-
- /*
- * CAP_SYS_ADMIN for setting opaque xattr
- * CAP_DAC_OVERRIDE for create in workdir, rename
- * CAP_FOWNER for removing whiteout from sticky dir
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- old_cred = override_creds(override_cred);
-
- err = ovl_create_over_whiteout(dentry, inode, &stat, link,
- hardlink);
+ return err;
- revert_creds(old_cred);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (override_cred) {
+ override_cred->fsuid = inode->i_uid;
+ override_cred->fsgid = inode->i_gid;
+ put_cred(override_creds(override_cred));
put_cred(override_cred);
+
+ if (!ovl_dentry_is_opaque(dentry))
+ err = ovl_create_upper(dentry, inode, stat, link,
+ hardlink);
+ else
+ err = ovl_create_over_whiteout(dentry, inode, stat,
+ link, hardlink);
}
+ revert_creds(old_cred);
+ if (!err) {
+ struct inode *realinode = d_inode(ovl_dentry_upper(dentry));
- if (!err)
- inode = NULL;
-out_iput:
- iput(inode);
-out:
+ WARN_ON(inode->i_mode != realinode->i_mode);
+ WARN_ON(!uid_eq(inode->i_uid, realinode->i_uid));
+ WARN_ON(!gid_eq(inode->i_gid, realinode->i_gid));
+ }
return err;
}
@@ -441,13 +460,30 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
+ struct inode *inode;
+ struct kstat stat = {
+ .rdev = rdev,
+ };
err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
- ovl_drop_write(dentry);
- }
+ if (err)
+ goto out;
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode);
+ if (!inode)
+ goto out_drop_write;
+
+ inode_init_owner(inode, dentry->d_parent->d_inode, mode);
+ stat.mode = inode->i_mode;
+ err = ovl_create_or_link(dentry, inode, &stat, link, NULL);
+ if (err)
+ iput(inode);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
return err;
}
@@ -482,7 +518,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
- struct dentry *upper;
+ struct inode *inode;
err = ovl_want_write(old);
if (err)
@@ -492,8 +528,12 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
if (err)
goto out_drop_write;
- upper = ovl_dentry_upper(old);
- err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
+ inode = d_inode(old);
+ ihold(inode);
+
+ err = ovl_create_or_link(new, inode, NULL, NULL, ovl_dentry_upper(old));
+ if (err)
+ iput(inode);
out_drop_write:
ovl_drop_write(old);
@@ -511,75 +551,55 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
+ int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
- if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
- opaquedir = ovl_check_empty_and_clear(dentry);
- err = PTR_ERR(opaquedir);
- if (IS_ERR(opaquedir))
- goto out;
- } else {
- LIST_HEAD(list);
-
- /*
- * When removing an empty opaque directory, then it
- * makes no sense to replace it with an exact replica of
- * itself. But emptiness still needs to be checked.
- */
- err = ovl_check_empty_dir(dentry, &list);
- ovl_cache_free(&list);
- if (err)
- goto out;
- }
+ opaquedir = ovl_check_empty_and_clear(dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
- whiteout = ovl_whiteout(workdir, dentry);
- err = PTR_ERR(whiteout);
- if (IS_ERR(whiteout))
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
goto out_unlock;
- upper = ovl_dentry_upper(dentry);
- if (!upper) {
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
- err = PTR_ERR(upper);
- if (IS_ERR(upper))
- goto kill_whiteout;
-
- err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
- dput(upper);
- if (err)
- goto kill_whiteout;
- } else {
- int flags = 0;
+ err = -ESTALE;
+ if ((opaquedir && upper != opaquedir) ||
+ (!opaquedir && ovl_dentry_upper(dentry) &&
+ upper != ovl_dentry_upper(dentry))) {
+ goto out_dput_upper;
+ }
- if (opaquedir)
- upper = opaquedir;
- err = -ESTALE;
- if (upper->d_parent != upperdir)
- goto kill_whiteout;
+ whiteout = ovl_whiteout(workdir, dentry);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto out_dput_upper;
- if (is_dir)
- flags |= RENAME_EXCHANGE;
+ if (d_is_dir(upper))
+ flags = RENAME_EXCHANGE;
- err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
- if (err)
- goto kill_whiteout;
+ err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
+ if (err)
+ goto kill_whiteout;
+ if (flags)
+ ovl_cleanup(wdir, upper);
- if (is_dir)
- ovl_cleanup(wdir, upper);
- }
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
+out_dput_upper:
+ dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
@@ -645,6 +665,8 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
+ const struct cred *old_cred;
+
err = ovl_check_sticky(dentry);
if (err)
@@ -659,35 +681,18 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
goto out_drop_write;
type = ovl_path_type(dentry);
- if (OVL_TYPE_PURE_UPPER(type)) {
- err = ovl_remove_upper(dentry, is_dir);
- } else {
- const struct cred *old_cred;
- struct cred *override_cred;
-
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_drop_write;
-
- /*
- * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
- * CAP_DAC_OVERRIDE for create in workdir, rename
- * CAP_FOWNER for removing whiteout from sticky dir
- * CAP_FSETID for chmod of opaque dir
- * CAP_CHOWN for chown of opaque dir
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- cap_raise(override_cred->cap_effective, CAP_FSETID);
- cap_raise(override_cred->cap_effective, CAP_CHOWN);
- old_cred = override_creds(override_cred);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (OVL_TYPE_PURE_UPPER(type))
+ err = ovl_remove_upper(dentry, is_dir);
+ else
err = ovl_remove_and_whiteout(dentry, is_dir);
-
- revert_creds(old_cred);
- put_cred(override_cred);
+ revert_creds(old_cred);
+ if (!err) {
+ if (is_dir)
+ clear_nlink(dentry->d_inode);
+ else
+ drop_nlink(dentry->d_inode);
}
out_drop_write:
ovl_drop_write(dentry);
@@ -725,7 +730,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
- struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
@@ -794,26 +798,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
- if (old_opaque || new_opaque) {
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_drop_write;
-
- /*
- * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
- * CAP_DAC_OVERRIDE for create in workdir
- * CAP_FOWNER for removing whiteout from sticky dir
- * CAP_FSETID for chmod of opaque dir
- * CAP_CHOWN for chown of opaque dir
- */
- cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- cap_raise(override_cred->cap_effective, CAP_FOWNER);
- cap_raise(override_cred->cap_effective, CAP_FSETID);
- cap_raise(override_cred->cap_effective, CAP_CHOWN);
- old_cred = override_creds(override_cred);
- }
+ old_cred = ovl_override_creds(old->d_sb);
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
@@ -943,10 +928,7 @@ out_dput_old:
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
- if (old_opaque || new_opaque) {
- revert_creds(old_cred);
- put_cred(override_cred);
- }
+ revert_creds(old_cred);
out_drop_write:
ovl_drop_write(old);
out:
@@ -967,8 +949,10 @@ const struct inode_operations ovl_dir_inode_operations = {
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
- .setxattr = ovl_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
+ .get_acl = ovl_get_acl,
+ .update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a4ff5d0d7db91..1b885c156028d 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -41,6 +41,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
+ const struct cred *old_cred;
/*
* Check for permissions before trying to copy-up. This is redundant
@@ -59,16 +60,42 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
goto out;
+ if (attr->ia_valid & ATTR_SIZE) {
+ struct inode *realinode = d_inode(ovl_dentry_real(dentry));
+
+ err = -ETXTBSY;
+ if (atomic_read(&realinode->i_writecount) < 0)
+ goto out_drop_write;
+ }
+
err = ovl_copy_up(dentry);
if (!err) {
+ struct inode *winode = NULL;
+
upperdentry = ovl_dentry_upper(dentry);
+ if (attr->ia_valid & ATTR_SIZE) {
+ winode = d_inode(upperdentry);
+ err = get_write_access(winode);
+ if (err)
+ goto out_drop_write;
+ }
+
+ if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+ attr->ia_valid &= ~ATTR_MODE;
+
inode_lock(upperdentry->d_inode);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = notify_change(upperdentry, attr, NULL);
+ revert_creds(old_cred);
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
+
+ if (winode)
+ put_write_access(winode);
}
+out_drop_write:
ovl_drop_write(dentry);
out:
return err;
@@ -78,94 +105,46 @@ static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
+ const struct cred *old_cred;
+ int err;
ovl_path_real(dentry, &realpath);
- return vfs_getattr(&realpath, stat);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getattr(&realpath, stat);
+ revert_creds(old_cred);
+ return err;
}
int ovl_permission(struct inode *inode, int mask)
{
- struct ovl_entry *oe;
- struct dentry *alias = NULL;
- struct inode *realinode;
- struct dentry *realdentry;
bool is_upper;
+ struct inode *realinode = ovl_inode_real(inode, &is_upper);
+ const struct cred *old_cred;
int err;
- if (S_ISDIR(inode->i_mode)) {
- oe = inode->i_private;
- } else if (mask & MAY_NOT_BLOCK) {
- return -ECHILD;
- } else {
- /*
- * For non-directories find an alias and get the info
- * from there.
- */
- alias = d_find_any_alias(inode);
- if (WARN_ON(!alias))
- return -ENOENT;
-
- oe = alias->d_fsdata;
- }
-
- realdentry = ovl_entry_real(oe, &is_upper);
-
- if (ovl_is_default_permissions(inode)) {
- struct kstat stat;
- struct path realpath = { .dentry = realdentry };
-
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
- realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
-
- err = vfs_getattr(&realpath, &stat);
- if (err)
- return err;
-
- if ((stat.mode ^ inode->i_mode) & S_IFMT)
- return -ESTALE;
-
- inode->i_mode = stat.mode;
- inode->i_uid = stat.uid;
- inode->i_gid = stat.gid;
-
- return generic_permission(inode, mask);
- }
-
/* Careful in RCU walk mode */
- realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
- err = -ENOENT;
- goto out_dput;
+ return -ECHILD;
}
- if (mask & MAY_WRITE) {
- umode_t mode = realinode->i_mode;
-
- /*
- * Writes will always be redirected to upper layer, so
- * ignore lower layer being read-only.
- *
- * If the overlay itself is read-only then proceed
- * with the permission check, don't return EROFS.
- * This will only happen if this is the lower layer of
- * another overlayfs.
- *
- * If upper fs becomes read-only after the overlay was
- * constructed return EROFS to prevent modification of
- * upper layer.
- */
- err = -EROFS;
- if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- goto out_dput;
+ /*
+ * Check overlay inode with the creds of task and underlying inode
+ * with creds of mounter
+ */
+ err = generic_permission(inode, mask);
+ if (err)
+ return err;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ if (!is_upper && !special_file(realinode->i_mode) && mask & MAY_WRITE) {
+ mask &= ~(MAY_WRITE | MAY_APPEND);
+ /* Make sure mounter can read file for copy up later */
+ mask |= MAY_READ;
}
+ err = inode_permission(realinode, mask);
+ revert_creds(old_cred);
- err = __inode_permission(realinode, mask);
-out_dput:
- dput(alias);
return err;
}
@@ -175,6 +154,8 @@ static const char *ovl_get_link(struct dentry *dentry,
{
struct dentry *realdentry;
struct inode *realinode;
+ const struct cred *old_cred;
+ const char *p;
if (!dentry)
return ERR_PTR(-ECHILD);
@@ -185,13 +166,18 @@ static const char *ovl_get_link(struct dentry *dentry,
if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
- return realinode->i_op->get_link(realdentry, realinode, done);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ p = realinode->i_op->get_link(realdentry, realinode, done);
+ revert_creds(old_cred);
+ return p;
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
+ const struct cred *old_cred;
+ int err;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
@@ -199,37 +185,39 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
if (!realinode->i_op->readlink)
return -EINVAL;
- touch_atime(&realpath);
-
- return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+ revert_creds(old_cred);
+ return err;
}
-
static bool ovl_is_private_xattr(const char *name)
{
- return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
+#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "."
+ return strncmp(name, OVL_XATTR_PRE_NAME,
+ sizeof(OVL_XATTR_PRE_NAME) - 1) == 0;
}
-int ovl_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
int err;
struct dentry *upperdentry;
+ const struct cred *old_cred;
err = ovl_want_write(dentry);
if (err)
goto out;
- err = -EPERM;
- if (ovl_is_private_xattr(name))
- goto out_drop_write;
-
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_setxattr(upperdentry, name, value, size, flags);
+ revert_creds(old_cred);
out_drop_write:
ovl_drop_write(dentry);
@@ -237,41 +225,35 @@ out:
return err;
}
-static bool ovl_need_xattr_filter(struct dentry *dentry,
- enum ovl_path_type type)
-{
- if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
- return S_ISDIR(dentry->d_inode->i_mode);
- else
- return false;
-}
-
-ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
- void *value, size_t size)
+ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
{
- struct path realpath;
- enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ struct dentry *realdentry = ovl_dentry_real(dentry);
+ ssize_t res;
+ const struct cred *old_cred;
- if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ if (ovl_is_private_xattr(name))
return -ENODATA;
- return vfs_getxattr(realpath.dentry, name, value, size);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_getxattr(realdentry, name, value, size);
+ revert_creds(old_cred);
+ return res;
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
- struct path realpath;
- enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ struct dentry *realdentry = ovl_dentry_real(dentry);
ssize_t res;
int off;
+ const struct cred *old_cred;
- res = vfs_listxattr(realpath.dentry, list, size);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_listxattr(realdentry, list, size);
+ revert_creds(old_cred);
if (res <= 0 || size == 0)
return res;
- if (!ovl_need_xattr_filter(dentry, type))
- return res;
-
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
@@ -295,13 +277,14 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ const struct cred *old_cred;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
- if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ if (ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
@@ -316,13 +299,34 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
ovl_path_upper(dentry, &realpath);
}
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_removexattr(realpath.dentry, name);
+ revert_creds(old_cred);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
+struct posix_acl *ovl_get_acl(struct inode *inode, int type)
+{
+ struct inode *realinode = ovl_inode_real(inode, NULL);
+ const struct cred *old_cred;
+ struct posix_acl *acl;
+
+ if (!IS_POSIXACL(realinode))
+ return NULL;
+
+ if (!realinode->i_op->get_acl)
+ return NULL;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ acl = realinode->i_op->get_acl(realinode, type);
+ revert_creds(old_cred);
+
+ return acl;
+}
+
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
@@ -338,46 +342,60 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
return true;
}
-struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
+int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
{
- int err;
+ int err = 0;
struct path realpath;
enum ovl_path_type type;
- if (d_is_dir(dentry))
- return d_backing_inode(dentry);
-
type = ovl_path_real(dentry, &realpath);
if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
err = ovl_want_write(dentry);
- if (err)
- return ERR_PTR(err);
+ if (!err) {
+ if (file_flags & O_TRUNC)
+ err = ovl_copy_up_truncate(dentry);
+ else
+ err = ovl_copy_up(dentry);
+ ovl_drop_write(dentry);
+ }
+ }
- if (file_flags & O_TRUNC)
- err = ovl_copy_up_truncate(dentry);
- else
- err = ovl_copy_up(dentry);
- ovl_drop_write(dentry);
- if (err)
- return ERR_PTR(err);
+ return err;
+}
- ovl_path_upper(dentry, &realpath);
+int ovl_update_time(struct inode *inode, struct timespec *ts, int flags)
+{
+ struct dentry *alias;
+ struct path upperpath;
+
+ if (!(flags & S_ATIME))
+ return 0;
+
+ alias = d_find_any_alias(inode);
+ if (!alias)
+ return 0;
+
+ ovl_path_upper(alias, &upperpath);
+ if (upperpath.dentry) {
+ touch_atime(&upperpath);
+ inode->i_atime = d_inode(upperpath.dentry)->i_atime;
}
- if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
- return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
+ dput(alias);
- return d_backing_inode(realpath.dentry);
+ return 0;
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
- .setxattr = ovl_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
+ .get_acl = ovl_get_acl,
+ .update_time = ovl_update_time,
};
static const struct inode_operations ovl_symlink_inode_operations = {
@@ -385,30 +403,22 @@ static const struct inode_operations ovl_symlink_inode_operations = {
.get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
- .setxattr = ovl_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
+ .update_time = ovl_update_time,
};
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
- struct ovl_entry *oe)
+static void ovl_fill_inode(struct inode *inode, umode_t mode)
{
- struct inode *inode;
-
- inode = new_inode(sb);
- if (!inode)
- return NULL;
-
- mode &= S_IFMT;
-
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_flags |= S_NOATIME | S_NOCMTIME;
+ inode->i_flags |= S_NOCMTIME;
+ mode &= S_IFMT;
switch (mode) {
case S_IFDIR:
- inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
@@ -417,6 +427,10 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
inode->i_op = &ovl_symlink_inode_operations;
break;
+ default:
+ WARN(1, "illegal file type: %i\n", mode);
+ /* Fall through */
+
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
@@ -424,11 +438,42 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
+ }
+}
- default:
- WARN(1, "illegal file type: %i\n", mode);
- iput(inode);
- inode = NULL;
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (inode)
+ ovl_fill_inode(inode, mode);
+
+ return inode;
+}
+
+static int ovl_inode_test(struct inode *inode, void *data)
+{
+ return ovl_inode_real(inode, NULL) == data;
+}
+
+static int ovl_inode_set(struct inode *inode, void *data)
+{
+ inode->i_private = (void *) (((unsigned long) data) | OVL_ISUPPER_MASK);
+ return 0;
+}
+
+struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode)
+
+{
+ struct inode *inode;
+
+ inode = iget5_locked(sb, (unsigned long) realinode,
+ ovl_inode_test, ovl_inode_set, realinode);
+ if (inode && inode->i_state & I_NEW) {
+ ovl_fill_inode(inode, realinode->i_mode);
+ set_nlink(inode, realinode->i_nlink);
+ unlock_new_inode(inode);
}
return inode;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6a7090f4a4413..e4f5c9536bfea 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -23,9 +23,11 @@ enum ovl_path_type {
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
-#define OVL_XATTR_PRE_NAME "trusted.overlay."
-#define OVL_XATTR_PRE_LEN 16
-#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
+
+#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay"
+#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque"
+
+#define OVL_ISUPPER_MASK 1UL
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -131,6 +133,16 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
return err;
}
+static inline struct inode *ovl_inode_real(struct inode *inode, bool *is_upper)
+{
+ unsigned long x = (unsigned long) READ_ONCE(inode->i_private);
+
+ if (is_upper)
+ *is_upper = x & OVL_ISUPPER_MASK;
+
+ return (struct inode *) (x & ~OVL_ISUPPER_MASK);
+}
+
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
@@ -141,11 +153,9 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
-struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
-bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
@@ -153,7 +163,9 @@ void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
+const struct cred *ovl_override_creds(struct super_block *sb);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+void ovl_inode_update(struct inode *inode, struct inode *upperinode);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
@@ -171,20 +183,27 @@ int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
-int ovl_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags);
-ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
- void *value, size_t size);
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags);
+ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
-struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
+struct posix_acl *ovl_get_acl(struct inode *inode, int type);
+int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
+int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
- struct ovl_entry *oe);
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
+struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
+ to->i_mode = from->i_mode;
+ to->i_atime = from->i_atime;
+ to->i_mtime = from->i_mtime;
+ to->i_ctime = from->i_ctime;
}
/* dir.c */
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 6ec1e43a9a54a..cf37fc76fc9fc 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -36,6 +36,7 @@ struct ovl_dir_cache {
struct ovl_readdir_data {
struct dir_context ctx;
+ struct dentry *dentry;
bool is_lowest;
struct rb_root root;
struct list_head *list;
@@ -206,19 +207,10 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
- struct cred *override_cred;
-
- override_cred = prepare_creds();
- if (!override_cred)
- return -ENOMEM;
- /*
- * CAP_DAC_OVERRIDE for lookup
- */
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- old_cred = override_creds(override_cred);
+ old_cred = ovl_override_creds(rdd->dentry->d_sb);
- err = mutex_lock_killable(&dir->d_inode->i_mutex);
+ err = down_write_killable(&dir->d_inode->i_rwsem);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
@@ -232,7 +224,6 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
inode_unlock(dir->d_inode);
}
revert_creds(old_cred);
- put_cred(override_cred);
return err;
}
@@ -288,6 +279,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .dentry = dentry,
.list = list,
.root = RB_ROOT,
.is_lowest = false,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ef64984c9bbce..4036132842b53 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -16,10 +16,10 @@
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/module.h>
-#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/statfs.h>
#include <linux/seq_file.h>
+#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
@@ -42,6 +42,8 @@ struct ovl_fs {
long lower_namelen;
/* pathnames of lower and upper dirs, for show_options */
struct ovl_config config;
+ /* creds of process who forced instantiation of super block */
+ const struct cred *creator_cred;
};
struct ovl_dir_cache;
@@ -143,18 +145,11 @@ struct dentry *ovl_dentry_real(struct dentry *dentry)
return realdentry;
}
-struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
+static void ovl_inode_init(struct inode *inode, struct inode *realinode,
+ bool is_upper)
{
- struct dentry *realdentry;
-
- realdentry = ovl_upperdentry_dereference(oe);
- if (realdentry) {
- *is_upper = true;
- } else {
- realdentry = __ovl_dentry_lower(oe);
- *is_upper = false;
- }
- return realdentry;
+ WRITE_ONCE(inode->i_private, (unsigned long) realinode |
+ (is_upper ? OVL_ISUPPER_MASK : 0));
}
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
@@ -176,13 +171,6 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
return oe->cache;
}
-bool ovl_is_default_permissions(struct inode *inode)
-{
- struct ovl_fs *ofs = inode->i_sb->s_fs_info;
-
- return ofs->config.default_permissions;
-}
-
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -233,7 +221,6 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
WARN_ON(oe->__upperdentry);
- BUG_ON(!upperdentry->d_inode);
/*
* Make sure upperdentry is consistent before making it visible to
* ovl_upperdentry_dereference().
@@ -242,6 +229,16 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
oe->__upperdentry = upperdentry;
}
+void ovl_inode_update(struct inode *inode, struct inode *upperinode)
+{
+ WARN_ON(!upperinode);
+ WARN_ON(!inode_unhashed(inode));
+ WRITE_ONCE(inode->i_private,
+ (unsigned long) upperinode | OVL_ISUPPER_MASK);
+ if (!S_ISDIR(upperinode->i_mode))
+ __insert_inode_hash(inode, (unsigned long) upperinode);
+}
+
void ovl_dentry_version_inc(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -265,6 +262,13 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
+const struct cred *ovl_override_creds(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return override_creds(ofs->creator_cred);
+}
+
static bool ovl_is_opaquedir(struct dentry *dentry)
{
int res;
@@ -274,7 +278,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry)
if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
return false;
- res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
+ res = inode->i_op->getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1);
if (res == 1 && val == 'y')
return true;
@@ -295,6 +299,47 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
+static struct dentry *ovl_d_real(struct dentry *dentry,
+ const struct inode *inode,
+ unsigned int open_flags)
+{
+ struct dentry *real;
+
+ if (d_is_dir(dentry)) {
+ if (!inode || inode == d_inode(dentry))
+ return dentry;
+ goto bug;
+ }
+
+ if (d_is_negative(dentry))
+ return dentry;
+
+ if (open_flags) {
+ int err = ovl_open_maybe_copy_up(dentry, open_flags);
+
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ real = ovl_dentry_upper(dentry);
+ if (real && (!inode || inode == d_inode(real)))
+ return real;
+
+ real = ovl_dentry_lower(dentry);
+ if (!real)
+ goto bug;
+
+ if (!inode || inode == d_inode(real))
+ return real;
+
+ /* Handle recursion */
+ return d_real(real, inode, open_flags);
+bug:
+ WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
+ inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
+ return dentry;
+}
+
static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -338,12 +383,12 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
- .d_select_inode = ovl_d_select_inode,
+ .d_real = ovl_d_real,
};
static const struct dentry_operations ovl_reval_dentry_operations = {
.d_release = ovl_dentry_release,
- .d_select_inode = ovl_d_select_inode,
+ .d_real = ovl_d_real,
.d_revalidate = ovl_dentry_revalidate,
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
@@ -362,7 +407,8 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
static bool ovl_dentry_remote(struct dentry *dentry)
{
return dentry->d_flags &
- (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
+ DCACHE_OP_REAL);
}
static bool ovl_dentry_weird(struct dentry *dentry)
@@ -373,14 +419,16 @@ static bool ovl_dentry_weird(struct dentry *dentry)
DCACHE_OP_COMPARE);
}
-static inline struct dentry *ovl_lookup_real(struct dentry *dir,
- struct qstr *name)
+static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb,
+ struct dentry *dir,
+ const struct qstr *name)
{
+ const struct cred *old_cred;
struct dentry *dentry;
- inode_lock(dir->d_inode);
- dentry = lookup_one_len(name->name, dir, name->len);
- inode_unlock(dir->d_inode);
+ old_cred = ovl_override_creds(ovl_sb);
+ dentry = lookup_one_len_unlocked(name->name, dir, name->len);
+ revert_creds(old_cred);
if (IS_ERR(dentry)) {
if (PTR_ERR(dentry) == -ENOENT)
@@ -433,7 +481,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperdir = ovl_upperdentry_dereference(poe);
if (upperdir) {
- this = ovl_lookup_real(upperdir, &dentry->d_name);
+ this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name);
err = PTR_ERR(this);
if (IS_ERR(this))
goto out;
@@ -466,7 +514,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
bool opaque = false;
struct path lowerpath = poe->lowerstack[i];
- this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
+ this = ovl_lookup_real(dentry->d_sb,
+ lowerpath.dentry, &dentry->d_name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
/*
@@ -521,12 +570,19 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperdentry || ctr) {
struct dentry *realdentry;
+ struct inode *realinode;
realdentry = upperdentry ? upperdentry : stack[0].dentry;
+ realinode = d_inode(realdentry);
err = -ENOMEM;
- inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
- oe);
+ if (upperdentry && !d_is_dir(upperdentry)) {
+ inode = ovl_get_inode(dentry->d_sb, realinode);
+ } else {
+ inode = ovl_new_inode(dentry->d_sb, realinode->i_mode);
+ if (inode)
+ ovl_inode_init(inode, realinode, !!upperdentry);
+ }
if (!inode)
goto out_free_oe;
ovl_copyattr(realdentry->d_inode, inode);
@@ -555,7 +611,7 @@ out:
struct file *ovl_path_open(struct path *path, int flags)
{
- return dentry_open(path, flags, current_cred());
+ return dentry_open(path, flags | O_NOATIME, current_cred());
}
static void ovl_put_super(struct super_block *sb)
@@ -572,6 +628,7 @@ static void ovl_put_super(struct super_block *sb)
kfree(ufs->config.lowerdir);
kfree(ufs->config.upperdir);
kfree(ufs->config.workdir);
+ put_cred(ufs->creator_cred);
kfree(ufs);
}
@@ -637,6 +694,7 @@ static const struct super_operations ovl_super_operations = {
.statfs = ovl_statfs,
.show_options = ovl_show_options,
.remount_fs = ovl_remount,
+ .drop_inode = generic_delete_inode,
};
enum {
@@ -909,11 +967,102 @@ static unsigned int ovl_split_lowerdirs(char *str)
return ctr;
}
+static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *realinode = ovl_inode_real(inode, NULL);
+ struct posix_acl *acl = NULL;
+ int err;
+
+ /* Check that everything is OK before copy-up */
+ if (value) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ err = -EOPNOTSUPP;
+ if (!IS_POSIXACL(d_inode(workdir)))
+ goto out_acl_release;
+ if (!realinode->i_op->set_acl)
+ goto out_acl_release;
+ if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
+ err = acl ? -EACCES : 0;
+ goto out_acl_release;
+ }
+ err = -EPERM;
+ if (!inode_owner_or_capable(inode))
+ goto out_acl_release;
+
+ posix_acl_release(acl);
+
+ return ovl_setxattr(dentry, inode, handler->name, value, size, flags);
+
+out_acl_release:
+ posix_acl_release(acl);
+ return err;
+}
+
+static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return ovl_setxattr(dentry, inode, name, value, size, flags);
+}
+
+static int ovl_own_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return -EPERM;
+}
+
+static const struct xattr_handler ovl_posix_acl_access_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .set = ovl_posix_acl_xattr_set,
+};
+
+static const struct xattr_handler ovl_posix_acl_default_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .set = ovl_posix_acl_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_xattr_handler = {
+ .prefix = OVL_XATTR_PREFIX,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_other_xattr_handler = {
+ .prefix = "", /* catch all */
+ .set = ovl_other_xattr_set,
+};
+
+static const struct xattr_handler *ovl_xattr_handlers[] = {
+ &ovl_posix_acl_access_xattr_handler,
+ &ovl_posix_acl_default_xattr_handler,
+ &ovl_own_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler *ovl_xattr_noacl_handlers[] = {
+ &ovl_own_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL,
+};
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { NULL, NULL };
struct path workpath = { NULL, NULL };
struct dentry *root_dentry;
+ struct inode *realinode;
struct ovl_entry *oe;
struct ovl_fs *ufs;
struct path *stack = NULL;
@@ -1020,6 +1169,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
pr_err("overlayfs: failed to clone upperpath\n");
goto out_put_lowerpath;
}
+ /* Don't inherit atime flags */
+ ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+
+ sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran;
ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
err = PTR_ERR(ufs->workdir);
@@ -1033,16 +1186,21 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
/*
* Upper should support d_type, else whiteouts are visible.
* Given workdir and upper are on same fs, we can do
- * iterate_dir() on workdir.
+ * iterate_dir() on workdir. This check requires successful
+ * creation of workdir in previous step.
*/
- err = ovl_check_d_type_supported(&workpath);
- if (err < 0)
- goto out_put_workdir;
+ if (ufs->workdir) {
+ err = ovl_check_d_type_supported(&workpath);
+ if (err < 0)
+ goto out_put_workdir;
- if (!err) {
- pr_err("overlayfs: upper fs needs to support d_type.\n");
- err = -EINVAL;
- goto out_put_workdir;
+ /*
+ * We allowed this configuration and don't want to
+ * break users over kernel upgrade. So warn instead
+ * of erroring out.
+ */
+ if (!err)
+ pr_warn("overlayfs: upper fs needs to support d_type.\n");
}
}
@@ -1062,7 +1220,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
* Make lower_mnt R/O. That way fchmod/fchown on lower file
* will fail instead of modifying lower fs.
*/
- mnt->mnt_flags |= MNT_READONLY;
+ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
ufs->lower_mnt[ufs->numlower] = mnt;
ufs->numlower++;
@@ -1077,12 +1235,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
else
sb->s_d_op = &ovl_dentry_operations;
+ ufs->creator_cred = prepare_creds();
+ if (!ufs->creator_cred)
+ goto out_put_lower_mnt;
+
err = -ENOMEM;
oe = ovl_alloc_entry(numlower);
if (!oe)
- goto out_put_lower_mnt;
+ goto out_put_cred;
- root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
+ root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR));
if (!root_dentry)
goto out_free_oe;
@@ -1101,18 +1263,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
root_dentry->d_fsdata = oe;
- ovl_copyattr(ovl_dentry_real(root_dentry)->d_inode,
- root_dentry->d_inode);
+ realinode = d_inode(ovl_dentry_real(root_dentry));
+ ovl_inode_init(d_inode(root_dentry), realinode, !!upperpath.dentry);
+ ovl_copyattr(realinode, d_inode(root_dentry));
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_op = &ovl_super_operations;
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL))
+ sb->s_xattr = ovl_xattr_handlers;
+ else
+ sb->s_xattr = ovl_xattr_noacl_handlers;
sb->s_root = root_dentry;
sb->s_fs_info = ufs;
+ sb->s_flags |= MS_POSIXACL;
return 0;
out_free_oe:
kfree(oe);
+out_put_cred:
+ put_cred(ufs->creator_cred);
out_put_lower_mnt:
for (i = 0; i < ufs->numlower; i++)
mntput(ufs->lower_mnt[i]);
diff --git a/fs/pipe.c b/fs/pipe.c
index ab8dad3ccb6a8..4b32928f54266 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,7 @@
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
+#include <linux/memcontrol.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -134,7 +135,23 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
if (page_count(page) == 1 && !pipe->tmp_page)
pipe->tmp_page = page;
else
- page_cache_release(page);
+ put_page(page);
+}
+
+static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct page *page = buf->page;
+
+ if (page_count(page) == 1) {
+ if (memcg_kmem_enabled()) {
+ memcg_kmem_uncharge(page, 0);
+ __ClearPageKmemcg(page);
+ }
+ __SetPageLocked(page);
+ return 0;
+ }
+ return 1;
}
/**
@@ -180,7 +197,7 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
*/
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- page_cache_get(buf->page);
+ get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
@@ -211,7 +228,7 @@ EXPORT_SYMBOL(generic_pipe_buf_confirm);
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- page_cache_release(buf->page);
+ put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
@@ -219,7 +236,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
.can_merge = 1,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
+ .steal = anon_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
@@ -227,7 +244,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
.can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
+ .steal = anon_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
@@ -405,7 +422,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
int copied;
if (!page) {
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
@@ -611,7 +628,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
- pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+ pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
if (pipe) {
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
struct user_struct *user = get_current_user();
@@ -619,7 +636,9 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (!too_many_pipe_buffers_hard(user)) {
if (too_many_pipe_buffers_soft(user))
pipe_bufs = 1;
- pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
+ pipe->bufs = kcalloc(pipe_bufs,
+ sizeof(struct pipe_buffer),
+ GFP_KERNEL_ACCOUNT);
}
if (pipe->bufs) {
@@ -1010,7 +1029,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
if (nr_pages < pipe->nrbufs)
return -EBUSY;
- bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
+ bufs = kcalloc(nr_pages, sizeof(*bufs),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
diff --git a/fs/pnode.c b/fs/pnode.c
index c524fdddc7fb1..99899705b1055 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -198,7 +198,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
/* all accesses are serialized by namespace_sem */
static struct user_namespace *user_ns;
-static struct mount *last_dest, *last_source, *dest_master;
+static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
@@ -221,20 +221,22 @@ static int propagate_one(struct mount *m)
type = CL_MAKE_SHARED;
} else {
struct mount *n, *p;
+ bool done;
for (n = m; ; n = p) {
p = n->mnt_master;
- if (p == dest_master || IS_MNT_MARKED(p)) {
- while (last_dest->mnt_master != p) {
- last_source = last_source->mnt_master;
- last_dest = last_source->mnt_parent;
- }
- if (!peers(n, last_dest)) {
- last_source = last_source->mnt_master;
- last_dest = last_source->mnt_parent;
- }
+ if (p == dest_master || IS_MNT_MARKED(p))
break;
- }
}
+ do {
+ struct mount *parent = last_source->mnt_parent;
+ if (last_source == first_source)
+ break;
+ done = parent->mnt_master == p;
+ if (done && peers(n, parent))
+ break;
+ last_source = last_source->mnt_master;
+ } while (!done);
+
type = CL_SLAVE;
/* beginning of peer group among the slaves? */
if (IS_MNT_SHARED(m))
@@ -286,6 +288,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
*/
user_ns = current->nsproxy->mnt_ns->user_ns;
last_dest = dest_mnt;
+ first_source = source_mnt;
last_source = source_mnt;
mp = dest_mp;
list = tree_list;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 711dd51703768..59d47ab0791af 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -21,7 +21,7 @@
#include <linux/export.h>
#include <linux/user_namespace.h>
-struct posix_acl **acl_by_type(struct inode *inode, int type)
+static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
switch (type) {
case ACL_TYPE_ACCESS:
@@ -32,19 +32,22 @@ struct posix_acl **acl_by_type(struct inode *inode, int type)
BUG();
}
}
-EXPORT_SYMBOL(acl_by_type);
struct posix_acl *get_cached_acl(struct inode *inode, int type)
{
struct posix_acl **p = acl_by_type(inode, type);
- struct posix_acl *acl = ACCESS_ONCE(*p);
- if (acl) {
- spin_lock(&inode->i_lock);
- acl = *p;
- if (acl != ACL_NOT_CACHED)
- acl = posix_acl_dup(acl);
- spin_unlock(&inode->i_lock);
+ struct posix_acl *acl;
+
+ for (;;) {
+ rcu_read_lock();
+ acl = rcu_dereference(*p);
+ if (!acl || is_uncached_acl(acl) ||
+ atomic_inc_not_zero(&acl->a_refcount))
+ break;
+ rcu_read_unlock();
+ cpu_relax();
}
+ rcu_read_unlock();
return acl;
}
EXPORT_SYMBOL(get_cached_acl);
@@ -59,58 +62,72 @@ void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
{
struct posix_acl **p = acl_by_type(inode, type);
struct posix_acl *old;
- spin_lock(&inode->i_lock);
- old = *p;
- rcu_assign_pointer(*p, posix_acl_dup(acl));
- spin_unlock(&inode->i_lock);
- if (old != ACL_NOT_CACHED)
+
+ old = xchg(p, posix_acl_dup(acl));
+ if (!is_uncached_acl(old))
posix_acl_release(old);
}
EXPORT_SYMBOL(set_cached_acl);
-void forget_cached_acl(struct inode *inode, int type)
+static void __forget_cached_acl(struct posix_acl **p)
{
- struct posix_acl **p = acl_by_type(inode, type);
struct posix_acl *old;
- spin_lock(&inode->i_lock);
- old = *p;
- *p = ACL_NOT_CACHED;
- spin_unlock(&inode->i_lock);
- if (old != ACL_NOT_CACHED)
+
+ old = xchg(p, ACL_NOT_CACHED);
+ if (!is_uncached_acl(old))
posix_acl_release(old);
}
+
+void forget_cached_acl(struct inode *inode, int type)
+{
+ __forget_cached_acl(acl_by_type(inode, type));
+}
EXPORT_SYMBOL(forget_cached_acl);
void forget_all_cached_acls(struct inode *inode)
{
- struct posix_acl *old_access, *old_default;
- spin_lock(&inode->i_lock);
- old_access = inode->i_acl;
- old_default = inode->i_default_acl;
- inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
- spin_unlock(&inode->i_lock);
- if (old_access != ACL_NOT_CACHED)
- posix_acl_release(old_access);
- if (old_default != ACL_NOT_CACHED)
- posix_acl_release(old_default);
+ __forget_cached_acl(&inode->i_acl);
+ __forget_cached_acl(&inode->i_default_acl);
}
EXPORT_SYMBOL(forget_all_cached_acls);
struct posix_acl *get_acl(struct inode *inode, int type)
{
+ void *sentinel;
+ struct posix_acl **p;
struct posix_acl *acl;
+ /*
+ * The sentinel is used to detect when another operation like
+ * set_cached_acl() or forget_cached_acl() races with get_acl().
+ * It is guaranteed that is_uncached_acl(sentinel) is true.
+ */
+
acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
+ if (!is_uncached_acl(acl))
return acl;
if (!IS_POSIXACL(inode))
return NULL;
+ sentinel = uncached_acl_sentinel(current);
+ p = acl_by_type(inode, type);
+
/*
- * A filesystem can force a ACL callback by just never filling the
- * ACL cache. But normally you'd fill the cache either at inode
- * instantiation time, or on the first ->get_acl call.
+ * If the ACL isn't being read yet, set our sentinel. Otherwise, the
+ * current value of the ACL will not be ACL_NOT_CACHED and so our own
+ * sentinel will not be set; another task will update the cache. We
+ * could wait for that other task to complete its job, but it's easier
+ * to just call ->get_acl to fetch the ACL ourselves. (This is going to
+ * be an unlikely race.)
+ */
+ if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
+ /* fall through */ ;
+
+ /*
+ * Normally, the ACL returned by ->get_acl will be cached.
+ * A filesystem can prevent that by calling
+ * forget_cached_acl(inode, type) in ->get_acl.
*
* If the filesystem doesn't have a get_acl() function at all, we'll
* just create the negative cache entry.
@@ -119,7 +136,24 @@ struct posix_acl *get_acl(struct inode *inode, int type)
set_cached_acl(inode, type, NULL);
return NULL;
}
- return inode->i_op->get_acl(inode, type);
+ acl = inode->i_op->get_acl(inode, type);
+
+ if (IS_ERR(acl)) {
+ /*
+ * Remove our sentinel so that we don't block future attempts
+ * to cache the ACL.
+ */
+ cmpxchg(p, sentinel, ACL_NOT_CACHED);
+ return acl;
+ }
+
+ /*
+ * Cache the result, but only if our sentinel is still in place.
+ */
+ posix_acl_dup(acl);
+ if (unlikely(cmpxchg(p, sentinel, acl) != sentinel))
+ posix_acl_release(acl);
+ return acl;
}
EXPORT_SYMBOL(get_acl);
@@ -171,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
* Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
*/
int
-posix_acl_valid(const struct posix_acl *acl)
+posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
{
const struct posix_acl_entry *pa, *pe;
int state = ACL_USER_OBJ;
@@ -191,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl)
case ACL_USER:
if (state != ACL_USER)
return -EINVAL;
- if (!uid_valid(pa->e_uid))
+ if (!kuid_has_mapping(user_ns, pa->e_uid))
return -EINVAL;
needs_mask = 1;
break;
@@ -206,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl)
case ACL_GROUP:
if (state != ACL_GROUP)
return -EINVAL;
- if (!gid_valid(pa->e_gid))
+ if (!kgid_has_mapping(user_ns, pa->e_gid))
return -EINVAL;
needs_mask = 1;
break;
@@ -763,18 +797,18 @@ EXPORT_SYMBOL (posix_acl_to_xattr);
static int
posix_acl_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- void *value, size_t size)
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *value, size_t size)
{
struct posix_acl *acl;
int error;
- if (!IS_POSIXACL(d_backing_inode(dentry)))
+ if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
- if (d_is_symlink(dentry))
+ if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- acl = get_acl(d_backing_inode(dentry), handler->flags);
+ acl = get_acl(inode, handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -786,39 +820,43 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
return error;
}
-static int
-posix_acl_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int
+set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
{
- struct inode *inode = d_backing_inode(dentry);
- struct posix_acl *acl = NULL;
- int ret;
-
if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
- return value ? -EACCES : 0;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
+ if (acl) {
+ int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl);
+ if (ret)
+ return ret;
+ }
+ return inode->i_op->set_acl(inode, acl, type);
+}
+EXPORT_SYMBOL(set_posix_acl);
+
+static int
+posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct posix_acl *acl = NULL;
+ int ret;
+
if (value) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
-
- if (acl) {
- ret = posix_acl_valid(acl);
- if (ret)
- goto out;
- }
}
-
- ret = inode->i_op->set_acl(inode, acl, handler->flags);
-out:
+ ret = set_posix_acl(inode, handler->flags, acl);
posix_acl_release(acl);
return ret;
}
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7151ea428041d..12c6922c913c4 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -4,6 +4,7 @@
obj-y += proc.o
+CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index b6c00ce0e29e3..88c7de12197bd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -83,6 +83,7 @@
#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
+#include <linux/fs_struct.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -139,12 +140,25 @@ static inline const char *get_task_state(struct task_struct *tsk)
return task_state_array[fls(state)];
}
+static inline int get_task_umask(struct task_struct *tsk)
+{
+ struct fs_struct *fs;
+ int umask = -ENOENT;
+
+ task_lock(tsk);
+ fs = tsk->fs;
+ if (fs)
+ umask = fs->umask;
+ task_unlock(tsk);
+ return umask;
+}
+
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
- int g;
+ int g, umask;
struct task_struct *tracer;
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
@@ -162,6 +176,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
+ umask = get_task_umask(p);
+ if (umask >= 0)
+ seq_printf(m, "Umask:\t%#04o\n", umask);
+
task_lock(p);
if (p->files)
max_fds = files_fdtable(p->files)->max_fds;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b1755b23893e5..54e270262979b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -434,7 +434,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
&& !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
else
- seq_puts(m, "0\n");
+ seq_putc(m, '0');
return 0;
}
@@ -579,11 +579,8 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
unsigned long totalpages = totalram_pages + total_swap_pages;
unsigned long points = 0;
- read_lock(&tasklist_lock);
- if (pid_alive(task))
- points = oom_badness(task, NULL, NULL, totalpages) *
- 1000 / totalpages;
- read_unlock(&tasklist_lock);
+ points = oom_badness(task, NULL, NULL, totalpages) *
+ 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
@@ -955,7 +952,8 @@ static ssize_t environ_read(struct file *file, char __user *buf,
struct mm_struct *mm = file->private_data;
unsigned long env_start, env_end;
- if (!mm)
+ /* Ensure the process spawned far enough to have an environment. */
+ if (!mm || !mm->env_end)
return 0;
page = (char *)__get_free_page(GFP_TEMPORARY);
@@ -1023,23 +1021,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
char buffer[PROC_NUMBUF];
int oom_adj = OOM_ADJUST_MIN;
size_t len;
- unsigned long flags;
if (!task)
return -ESRCH;
- if (lock_task_sighand(task, &flags)) {
- if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
- oom_adj = OOM_ADJUST_MAX;
- else
- oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
- OOM_SCORE_ADJ_MAX;
- unlock_task_sighand(task, &flags);
- }
+ if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+ oom_adj = OOM_ADJUST_MAX;
+ else
+ oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+ OOM_SCORE_ADJ_MAX;
put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
+static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+{
+ static DEFINE_MUTEX(oom_adj_mutex);
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ int err = 0;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+
+ mutex_lock(&oom_adj_mutex);
+ if (legacy) {
+ if (oom_adj < task->signal->oom_score_adj &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_unlock;
+ }
+ /*
+ * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ * /proc/pid/oom_score_adj instead.
+ */
+ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ current->comm, task_pid_nr(current), task_pid_nr(task),
+ task_pid_nr(task));
+ } else {
+ if ((short)oom_adj < task->signal->oom_score_adj_min &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_unlock;
+ }
+ }
+
+ /*
+ * Make sure we will check other processes sharing the mm if this is
+ * not vfork which wants its own oom_score_adj.
+ * pin the mm so it doesn't go away and get reused after task_unlock
+ */
+ if (!task->vfork_done) {
+ struct task_struct *p = find_lock_task_mm(task);
+
+ if (p) {
+ if (atomic_read(&p->mm->mm_users) > 1) {
+ mm = p->mm;
+ atomic_inc(&mm->mm_count);
+ }
+ task_unlock(p);
+ }
+ }
+
+ task->signal->oom_score_adj = oom_adj;
+ if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ task->signal->oom_score_adj_min = (short)oom_adj;
+ trace_oom_score_adj_update(task);
+
+ if (mm) {
+ struct task_struct *p;
+
+ rcu_read_lock();
+ for_each_process(p) {
+ if (same_thread_group(task, p))
+ continue;
+
+ /* do not touch kernel threads or the global init */
+ if (p->flags & PF_KTHREAD || is_global_init(p))
+ continue;
+
+ task_lock(p);
+ if (!p->vfork_done && process_shares_mm(p, mm)) {
+ pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
+ task_pid_nr(p), p->comm,
+ p->signal->oom_score_adj, oom_adj,
+ task_pid_nr(task), task->comm);
+ p->signal->oom_score_adj = oom_adj;
+ if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ p->signal->oom_score_adj_min = (short)oom_adj;
+ }
+ task_unlock(p);
+ }
+ rcu_read_unlock();
+ mmdrop(mm);
+ }
+err_unlock:
+ mutex_unlock(&oom_adj_mutex);
+ put_task_struct(task);
+ return err;
+}
+
/*
* /proc/pid/oom_adj exists solely for backwards compatibility with previous
* kernels. The effective policy is defined by oom_score_adj, which has a
@@ -1053,10 +1135,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task;
char buffer[PROC_NUMBUF];
int oom_adj;
- unsigned long flags;
int err;
memset(buffer, 0, sizeof(buffer));
@@ -1076,23 +1156,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
goto out;
}
- task = get_proc_task(file_inode(file));
- if (!task) {
- err = -ESRCH;
- goto out;
- }
-
- task_lock(task);
- if (!task->mm) {
- err = -EINVAL;
- goto err_task_lock;
- }
-
- if (!lock_task_sighand(task, &flags)) {
- err = -ESRCH;
- goto err_task_lock;
- }
-
/*
* Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
* value is always attainable.
@@ -1102,27 +1165,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
else
oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
- if (oom_adj < task->signal->oom_score_adj &&
- !capable(CAP_SYS_RESOURCE)) {
- err = -EACCES;
- goto err_sighand;
- }
-
- /*
- * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
- * /proc/pid/oom_score_adj instead.
- */
- pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
- current->comm, task_pid_nr(current), task_pid_nr(task),
- task_pid_nr(task));
-
- task->signal->oom_score_adj = oom_adj;
- trace_oom_score_adj_update(task);
-err_sighand:
- unlock_task_sighand(task, &flags);
-err_task_lock:
- task_unlock(task);
- put_task_struct(task);
+ err = __set_oom_adj(file, oom_adj, true);
out:
return err < 0 ? err : count;
}
@@ -1139,15 +1182,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
struct task_struct *task = get_proc_task(file_inode(file));
char buffer[PROC_NUMBUF];
short oom_score_adj = OOM_SCORE_ADJ_MIN;
- unsigned long flags;
size_t len;
if (!task)
return -ESRCH;
- if (lock_task_sighand(task, &flags)) {
- oom_score_adj = task->signal->oom_score_adj;
- unlock_task_sighand(task, &flags);
- }
+ oom_score_adj = task->signal->oom_score_adj;
put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -1156,9 +1195,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task;
char buffer[PROC_NUMBUF];
- unsigned long flags;
int oom_score_adj;
int err;
@@ -1179,39 +1216,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
goto out;
}
- task = get_proc_task(file_inode(file));
- if (!task) {
- err = -ESRCH;
- goto out;
- }
-
- task_lock(task);
- if (!task->mm) {
- err = -EINVAL;
- goto err_task_lock;
- }
-
- if (!lock_task_sighand(task, &flags)) {
- err = -ESRCH;
- goto err_task_lock;
- }
-
- if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
- !capable(CAP_SYS_RESOURCE)) {
- err = -EACCES;
- goto err_sighand;
- }
-
- task->signal->oom_score_adj = (short)oom_score_adj;
- if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
- task->signal->oom_score_adj_min = (short)oom_score_adj;
- trace_oom_score_adj_update(task);
-
-err_sighand:
- unlock_task_sighand(task, &flags);
-err_task_lock:
- task_unlock(task);
- put_task_struct(task);
+ err = __set_oom_adj(file, oom_score_adj, false);
out:
return err < 0 ? err : count;
}
@@ -1819,12 +1824,17 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
child = d_hash_and_lookup(dir, &qname);
if (!child) {
- child = d_alloc(dir, &qname);
- if (!child)
- goto end_instantiate;
- if (instantiate(d_inode(dir), child, task, ptr) < 0) {
- dput(child);
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ child = d_alloc_parallel(dir, &qname, &wq);
+ if (IS_ERR(child))
goto end_instantiate;
+ if (d_in_lookup(child)) {
+ int err = instantiate(d_inode(dir), child, task, ptr);
+ d_lookup_done(child);
+ if (err < 0) {
+ dput(child);
+ goto end_instantiate;
+ }
}
}
inode = d_inode(child);
@@ -2154,8 +2164,8 @@ out:
static const struct file_operations proc_map_files_operations = {
.read = generic_read_dir,
- .iterate = proc_map_files_readdir,
- .llseek = default_llseek,
+ .iterate_shared = proc_map_files_readdir,
+ .llseek = generic_file_llseek,
};
#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -2502,8 +2512,8 @@ static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
static const struct file_operations proc_attr_dir_operations = {
.read = generic_read_dir,
- .iterate = proc_attr_dir_readdir,
- .llseek = default_llseek,
+ .iterate_shared = proc_attr_dir_readdir,
+ .llseek = generic_file_llseek,
};
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2910,8 +2920,8 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
static const struct file_operations proc_tgid_base_operations = {
.read = generic_read_dir,
- .iterate = proc_tgid_base_readdir,
- .llseek = default_llseek,
+ .iterate_shared = proc_tgid_base_readdir,
+ .llseek = generic_file_llseek,
};
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -3157,6 +3167,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
}
/*
+ * proc_tid_comm_permission is a special permission function exclusively
+ * used for the node /proc/<pid>/task/<tid>/comm.
+ * It bypasses generic permission checks in the case where a task of the same
+ * thread group attempts to access the node.
+ * The rationale behind this is that glibc and bionic access this node for
+ * cross thread naming (pthread_set/getname_np(!self)). However, if
+ * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
+ * which locks out the cross thread naming implementation.
+ * This function makes sure that the node is always accessible for members of
+ * the same thread group.
+ */
+static int proc_tid_comm_permission(struct inode *inode, int mask)
+{
+ bool is_same_tgroup;
+ struct task_struct *task;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ is_same_tgroup = same_thread_group(current, task);
+ put_task_struct(task);
+
+ if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
+ /* This file (/proc/<pid>/task/<tid>/comm) can always be
+ * read or written by the members of the corresponding
+ * thread group.
+ */
+ return 0;
+ }
+
+ return generic_permission(inode, mask);
+}
+
+static const struct inode_operations proc_tid_comm_inode_operations = {
+ .permission = proc_tid_comm_permission,
+};
+
+/*
* Tasks
*/
static const struct pid_entry tid_base_stuff[] = {
@@ -3174,7 +3222,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
- REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+ NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
+ &proc_tid_comm_inode_operations,
+ &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
@@ -3258,8 +3308,8 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
static const struct file_operations proc_tid_base_operations = {
.read = generic_read_dir,
- .iterate = proc_tid_base_readdir,
- .llseek = default_llseek,
+ .iterate_shared = proc_tid_base_readdir,
+ .llseek = generic_file_llseek,
};
static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3469,6 +3519,6 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
- .iterate = proc_task_readdir,
- .llseek = default_llseek,
+ .iterate_shared = proc_task_readdir,
+ .llseek = generic_file_llseek,
};
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 56afa5ef08f2d..01df23cc81f62 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -276,8 +276,8 @@ static int proc_readfd(struct file *file, struct dir_context *ctx)
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .iterate = proc_readfd,
- .llseek = default_llseek,
+ .iterate_shared = proc_readfd,
+ .llseek = generic_file_llseek,
};
static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
@@ -361,6 +361,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
const struct file_operations proc_fdinfo_operations = {
.read = generic_read_dir,
- .iterate = proc_readfdinfo,
- .llseek = default_llseek,
+ .iterate_shared = proc_readfdinfo,
+ .llseek = generic_file_llseek,
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index ff3ffc76a9379..c633476616e0f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -318,7 +318,7 @@ int proc_readdir(struct file *file, struct dir_context *ctx)
static const struct file_operations proc_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = proc_readdir,
+ .iterate_shared = proc_readdir,
};
/*
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 42305ddcbaa00..c1b72388e5711 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -457,17 +457,30 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
return inode;
}
-int proc_fill_super(struct super_block *s)
+int proc_fill_super(struct super_block *s, void *data, int silent)
{
+ struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
struct inode *root_inode;
int ret;
+ if (!proc_parse_options(data, ns))
+ return -EINVAL;
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
s->s_time_gran = 1;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2781095bd15..7931c558c1925 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -212,7 +212,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern void proc_init_inodecache(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *);
+extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
@@ -268,6 +268,7 @@ static inline void proc_tty_init(void) {}
* root.c
*/
extern struct proc_dir_entry proc_root;
+extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
extern int proc_remount(struct super_block *, int *, char *);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 83720460c5bc7..09e18fdf61e5b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
- cached = global_page_state(NR_FILE_PAGES) -
+ cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
if (cached < 0)
cached = 0;
@@ -105,6 +105,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"AnonHugePages: %8lu kB\n"
+ "ShmemHugePages: %8lu kB\n"
+ "ShmemPmdMapped: %8lu kB\n"
#endif
#ifdef CONFIG_CMA
"CmaTotal: %8lu kB\n"
@@ -136,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
K(i.totalswap),
K(i.freeswap),
- K(global_page_state(NR_FILE_DIRTY)),
- K(global_page_state(NR_WRITEBACK)),
- K(global_page_state(NR_ANON_PAGES)),
- K(global_page_state(NR_FILE_MAPPED)),
+ K(global_node_page_state(NR_FILE_DIRTY)),
+ K(global_node_page_state(NR_WRITEBACK)),
+ K(global_node_page_state(NR_ANON_MAPPED)),
+ K(global_node_page_state(NR_FILE_MAPPED)),
K(i.sharedram),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
- global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+ global_page_state(NR_KERNEL_STACK_KB),
K(global_page_state(NR_PAGETABLE)),
#ifdef CONFIG_QUICKLIST
K(quicklist_total_size()),
#endif
- K(global_page_state(NR_UNSTABLE_NFS)),
+ K(global_node_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
- K(global_page_state(NR_WRITEBACK_TEMP)),
+ K(global_node_page_state(NR_WRITEBACK_TEMP)),
K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
@@ -162,8 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
- HPAGE_PMD_NR)
+ , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
+ , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
+ , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
#endif
#ifdef CONFIG_CMA
, K(totalcma_pages)
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 72cb26f85d58d..51b8b0a8ad91b 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -139,7 +139,8 @@ out:
const struct file_operations proc_ns_dir_operations = {
.read = generic_read_dir,
- .iterate = proc_ns_dir_readdir,
+ .iterate_shared = proc_ns_dir_readdir,
+ .llseek = generic_file_llseek,
};
static struct dentry *proc_ns_dir_lookup(struct inode *dir,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 712f1b9992ccb..3ecd445e830dc 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
/*
- * Caveats on high order pages: page->_count will only be set
+ * Caveats on high order pages: page->_refcount will only be set
* -1 on the head page; SLUB/SLQB do the same for PG_slab;
* SLOB won't set PG_slab at all on compound pages.
*/
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 350984a19c834..c8bbc68cdb059 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -179,7 +179,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
const struct file_operations proc_net_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = proc_tgid_net_readdir,
+ .iterate_shared = proc_tgid_net_readdir,
};
static __net_init int proc_net_ns_init(struct net *net)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fe5b6e6c46719..1b93650dda2fc 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -474,7 +474,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
{
struct ctl_table_header *head = grab_header(dir);
struct ctl_table_header *h = NULL;
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
@@ -623,22 +623,23 @@ static bool proc_sys_fill_cache(struct file *file,
qname.name = table->procname;
qname.len = strlen(table->procname);
- qname.hash = full_name_hash(qname.name, qname.len);
+ qname.hash = full_name_hash(dir, qname.name, qname.len);
child = d_lookup(dir, &qname);
if (!child) {
- child = d_alloc(dir, &qname);
- if (child) {
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ child = d_alloc_parallel(dir, &qname, &wq);
+ if (IS_ERR(child))
+ return false;
+ if (d_in_lookup(child)) {
inode = proc_sys_make_inode(dir->d_sb, head, table);
if (!inode) {
+ d_lookup_done(child);
dput(child);
return false;
- } else {
- d_set_d_op(child, &proc_sys_dentry_operations);
- d_add(child, inode);
}
- } else {
- return false;
+ d_set_d_op(child, &proc_sys_dentry_operations);
+ d_add(child, inode);
}
}
inode = d_inode(child);
@@ -789,7 +790,7 @@ static const struct file_operations proc_sys_file_operations = {
static const struct file_operations proc_sys_dir_file_operations = {
.read = generic_read_dir,
- .iterate = proc_sys_readdir,
+ .iterate_shared = proc_sys_readdir,
.llseek = generic_file_llseek,
};
@@ -833,7 +834,7 @@ static int sysctl_is_seen(struct ctl_table_header *p)
return res;
}
-static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
+static int proc_sys_compare(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct ctl_table_header *head;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 361ab4ee42fc3..8d3e484055a6b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -23,21 +23,6 @@
#include "internal.h"
-static int proc_test_super(struct super_block *sb, void *data)
-{
- return sb->s_fs_info == data;
-}
-
-static int proc_set_super(struct super_block *sb, void *data)
-{
- int err = set_anon_super(sb, NULL);
- if (!err) {
- struct pid_namespace *ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
- }
- return err;
-}
-
enum {
Opt_gid, Opt_hidepid, Opt_err,
};
@@ -48,7 +33,7 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
-static int proc_parse_options(char *options, struct pid_namespace *pid)
+int proc_parse_options(char *options, struct pid_namespace *pid)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -100,45 +85,16 @@ int proc_remount(struct super_block *sb, int *flags, char *data)
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- int err;
- struct super_block *sb;
struct pid_namespace *ns;
- char *options;
if (flags & MS_KERNMOUNT) {
- ns = (struct pid_namespace *)data;
- options = NULL;
+ ns = data;
+ data = NULL;
} else {
ns = task_active_pid_ns(current);
- options = data;
-
- /* Does the mounter have privilege over the pid namespace? */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
- }
-
- sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
- if (IS_ERR(sb))
- return ERR_CAST(sb);
-
- if (!proc_parse_options(options, ns)) {
- deactivate_locked_super(sb);
- return ERR_PTR(-EINVAL);
- }
-
- if (!sb->s_root) {
- err = proc_fill_super(sb);
- if (err) {
- deactivate_locked_super(sb);
- return ERR_PTR(err);
- }
-
- sb->s_flags |= MS_ACTIVE;
- /* User space would break if executables appear on proc */
- sb->s_iflags |= SB_I_NOEXEC;
}
- return dget(sb->s_root);
+ return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
}
static void proc_kill_sb(struct super_block *sb)
@@ -158,7 +114,7 @@ static struct file_system_type proc_fs_type = {
.name = "proc",
.mount = proc_mount,
.kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
@@ -226,8 +182,8 @@ static int proc_root_readdir(struct file *file, struct dir_context *ctx)
*/
static const struct file_operations proc_root_operations = {
.read = generic_read_dir,
- .iterate = proc_root_readdir,
- .llseek = default_llseek,
+ .iterate_shared = proc_root_readdir,
+ .llseek = generic_file_llseek,
};
/*
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 510413eb25b8b..7907e456ac4f8 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -80,19 +80,17 @@ static u64 get_iowait_time(int cpu)
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
- unsigned long jif;
u64 user, nice, system, idle, iowait, irq, softirq, steal;
u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
- struct timespec boottime;
+ struct timespec64 boottime;
user = nice = system = idle = iowait =
irq = softirq = steal = 0;
guest = guest_nice = 0;
- getboottime(&boottime);
- jif = boottime.tv_sec;
+ getboottime64(&boottime);
for_each_possible_cpu(i) {
user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -163,12 +161,12 @@ static int show_stat(struct seq_file *p, void *v)
seq_printf(p,
"\nctxt %llu\n"
- "btime %lu\n"
+ "btime %llu\n"
"processes %lu\n"
"procs_running %lu\n"
"procs_blocked %lu\n",
nr_context_switches(),
- (unsigned long)jif,
+ (unsigned long long)boottime.tv_sec,
total_forks,
nr_running(),
nr_iowait());
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9df4316420422..187d84ef9de9d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -448,6 +448,7 @@ struct mem_size_stats {
unsigned long referenced;
unsigned long anonymous;
unsigned long anonymous_thp;
+ unsigned long shmem_thp;
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
@@ -553,7 +554,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (radix_tree_exceptional_entry(page))
mss->swap += PAGE_SIZE;
else
- page_cache_release(page);
+ put_page(page);
return;
}
@@ -576,7 +577,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
if (IS_ERR_OR_NULL(page))
return;
- mss->anonymous_thp += HPAGE_PMD_SIZE;
+ if (PageAnon(page))
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
+ else if (PageSwapBacked(page))
+ mss->shmem_thp += HPAGE_PMD_SIZE;
+ else
+ VM_BUG_ON_PAGE(1, page);
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
@@ -770,6 +776,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
+ "ShmemPmdMapped: %8lu kB\n"
"Shared_Hugetlb: %8lu kB\n"
"Private_Hugetlb: %7lu kB\n"
"Swap: %8lu kB\n"
@@ -787,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
+ mss.shmem_thp >> 10,
mss.shared_hugetlb >> 10,
mss.private_hugetlb >> 10,
mss.swap >> 10,
@@ -1027,11 +1035,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
};
if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ if (down_write_killable(&mm->mmap_sem)) {
+ count = -EINTR;
+ goto out_mm;
+ }
+
/*
* Writing 5 to /proc/pid/clear_refs resets the peak
* resident set size to this mm's current rss value.
*/
- down_write(&mm->mmap_sem);
reset_mm_hiwater_rss(mm);
up_write(&mm->mmap_sem);
goto out_mm;
@@ -1043,7 +1055,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
up_read(&mm->mmap_sem);
- down_write(&mm->mmap_sem);
+ if (down_write_killable(&mm->mmap_sem)) {
+ count = -EINTR;
+ goto out_mm;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
@@ -1518,6 +1533,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return page;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ int nid;
+
+ if (!pmd_present(pmd))
+ return NULL;
+
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ return NULL;
+
+ if (PageReserved(page))
+ return NULL;
+
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_MEMORY]))
+ return NULL;
+
+ return page;
+}
+#endif
+
static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -1527,14 +1568,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
- pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
- page = can_gather_numa_stats(huge_pte, vma, addr);
+ page = can_gather_numa_stats_pmd(*pmd, vma, addr);
if (page)
- gather_stats(page, md, pte_dirty(huge_pte),
+ gather_stats(page, md, pmd_dirty(*pmd),
HPAGE_PMD_SIZE/PAGE_SIZE);
spin_unlock(ptl);
return 0;
@@ -1542,6 +1583,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
+#endif
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
struct page *page = can_gather_numa_stats(*pte, vma, addr);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 55bb57e6a30d3..8ab782d8b33dd 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -279,12 +279,12 @@ static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!page)
return VM_FAULT_OOM;
if (!PageUptodate(page)) {
- offset = (loff_t) index << PAGE_CACHE_SHIFT;
+ offset = (loff_t) index << PAGE_SHIFT;
buf = __va((page_to_pfn(page) << PAGE_SHIFT));
rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
if (rc < 0) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}
SetPageUptodate(page);
@@ -1071,7 +1071,7 @@ static int __init parse_crash_elf32_headers(void)
/* Do some basic Verification. */
if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
(ehdr.e_type != ET_CORE) ||
- !elf_check_arch(&ehdr) ||
+ !vmcore_elf32_check_arch(&ehdr) ||
ehdr.e_ident[EI_CLASS] != ELFCLASS32||
ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
ehdr.e_version != EV_CURRENT ||
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 360ae43f590cc..be40813eff52c 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,8 +1,6 @@
config PSTORE
tristate "Persistent store support"
default n
- select ZLIB_DEFLATE
- select ZLIB_INFLATE
help
This option enables generic access to platform level
persistent storage via "pstore" filesystem that can
@@ -14,6 +12,35 @@ config PSTORE
If you don't have a platform persistent store driver,
say N.
+choice
+ prompt "Choose compression algorithm"
+ depends on PSTORE
+ default PSTORE_ZLIB_COMPRESS
+ help
+ This option chooses the compression algorithm.
+
+config PSTORE_ZLIB_COMPRESS
+ bool "ZLIB"
+ select ZLIB_DEFLATE
+ select ZLIB_INFLATE
+ help
+ This option enables ZLIB compression algorithm support.
+
+config PSTORE_LZO_COMPRESS
+ bool "LZO"
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ help
+ This option enables LZO compression algorithm support.
+
+config PSTORE_LZ4_COMPRESS
+ bool "LZ4"
+ select LZ4_COMPRESS
+ select LZ4_DECOMPRESS
+ help
+ This option enables LZ4 compression algorithm support.
+endchoice
+
config PSTORE_CONSOLE
bool "Log kernel console messages"
depends on PSTORE
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index dc645b66cd79a..ec9ddef5ae755 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,7 +178,6 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
}
static const struct file_operations pstore_file_operations = {
- .owner = THIS_MODULE,
.open = pstore_file_open,
.read = pstore_file_read,
.llseek = pstore_file_llseek,
@@ -420,8 +419,8 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
pstore_sb = sb;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = PSTOREFS_MAGIC;
sb->s_op = &pstore_ops;
sb->s_time_gran = 1;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 588461bb2dd48..16ecca5b72d81 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -28,7 +28,15 @@
#include <linux/console.h>
#include <linux/module.h>
#include <linux/pstore.h>
+#ifdef CONFIG_PSTORE_ZLIB_COMPRESS
#include <linux/zlib.h>
+#endif
+#ifdef CONFIG_PSTORE_LZO_COMPRESS
+#include <linux/lzo.h>
+#endif
+#ifdef CONFIG_PSTORE_LZ4_COMPRESS
+#include <linux/lz4.h>
+#endif
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/slab.h>
@@ -69,10 +77,23 @@ struct pstore_info *psinfo;
static char *backend;
/* Compression parameters */
+#ifdef CONFIG_PSTORE_ZLIB_COMPRESS
#define COMPR_LEVEL 6
#define WINDOW_BITS 12
#define MEM_LEVEL 4
static struct z_stream_s stream;
+#else
+static unsigned char *workspace;
+#endif
+
+struct pstore_zbackend {
+ int (*compress)(const void *in, void *out, size_t inlen, size_t outlen);
+ int (*decompress)(void *in, void *out, size_t inlen, size_t outlen);
+ void (*allocate)(void);
+ void (*free)(void);
+
+ const char *name;
+};
static char *big_oops_buf;
static size_t big_oops_buf_sz;
@@ -129,9 +150,9 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
}
EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
+#ifdef CONFIG_PSTORE_ZLIB_COMPRESS
/* Derived from logfs_compress() */
-static int pstore_compress(const void *in, void *out, size_t inlen,
- size_t outlen)
+static int compress_zlib(const void *in, void *out, size_t inlen, size_t outlen)
{
int err, ret;
@@ -165,7 +186,7 @@ error:
}
/* Derived from logfs_uncompress */
-static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
+static int decompress_zlib(void *in, void *out, size_t inlen, size_t outlen)
{
int err, ret;
@@ -194,7 +215,7 @@ error:
return ret;
}
-static void allocate_buf_for_compression(void)
+static void allocate_zlib(void)
{
size_t size;
size_t cmpr;
@@ -237,12 +258,190 @@ static void allocate_buf_for_compression(void)
}
-static void free_buf_for_compression(void)
+static void free_zlib(void)
{
kfree(stream.workspace);
stream.workspace = NULL;
kfree(big_oops_buf);
big_oops_buf = NULL;
+ big_oops_buf_sz = 0;
+}
+
+static struct pstore_zbackend backend_zlib = {
+ .compress = compress_zlib,
+ .decompress = decompress_zlib,
+ .allocate = allocate_zlib,
+ .free = free_zlib,
+ .name = "zlib",
+};
+#endif
+
+#ifdef CONFIG_PSTORE_LZO_COMPRESS
+static int compress_lzo(const void *in, void *out, size_t inlen, size_t outlen)
+{
+ int ret;
+
+ ret = lzo1x_1_compress(in, inlen, out, &outlen, workspace);
+ if (ret != LZO_E_OK) {
+ pr_err("lzo_compress error, ret = %d!\n", ret);
+ return -EIO;
+ }
+
+ return outlen;
+}
+
+static int decompress_lzo(void *in, void *out, size_t inlen, size_t outlen)
+{
+ int ret;
+
+ ret = lzo1x_decompress_safe(in, inlen, out, &outlen);
+ if (ret != LZO_E_OK) {
+ pr_err("lzo_decompress error, ret = %d!\n", ret);
+ return -EIO;
+ }
+
+ return outlen;
+}
+
+static void allocate_lzo(void)
+{
+ big_oops_buf_sz = lzo1x_worst_compress(psinfo->bufsize);
+ big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+ if (big_oops_buf) {
+ workspace = kmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+ if (!workspace) {
+ pr_err("No memory for compression workspace; skipping compression\n");
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ }
+ } else {
+ pr_err("No memory for uncompressed data; skipping compression\n");
+ workspace = NULL;
+ }
+}
+
+static void free_lzo(void)
+{
+ kfree(workspace);
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ big_oops_buf_sz = 0;
+}
+
+static struct pstore_zbackend backend_lzo = {
+ .compress = compress_lzo,
+ .decompress = decompress_lzo,
+ .allocate = allocate_lzo,
+ .free = free_lzo,
+ .name = "lzo",
+};
+#endif
+
+#ifdef CONFIG_PSTORE_LZ4_COMPRESS
+static int compress_lz4(const void *in, void *out, size_t inlen, size_t outlen)
+{
+ int ret;
+
+ ret = lz4_compress(in, inlen, out, &outlen, workspace);
+ if (ret) {
+ pr_err("lz4_compress error, ret = %d!\n", ret);
+ return -EIO;
+ }
+
+ return outlen;
+}
+
+static int decompress_lz4(void *in, void *out, size_t inlen, size_t outlen)
+{
+ int ret;
+
+ ret = lz4_decompress_unknownoutputsize(in, inlen, out, &outlen);
+ if (ret) {
+ pr_err("lz4_decompress error, ret = %d!\n", ret);
+ return -EIO;
+ }
+
+ return outlen;
+}
+
+static void allocate_lz4(void)
+{
+ big_oops_buf_sz = lz4_compressbound(psinfo->bufsize);
+ big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+ if (big_oops_buf) {
+ workspace = kmalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+ if (!workspace) {
+ pr_err("No memory for compression workspace; skipping compression\n");
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ }
+ } else {
+ pr_err("No memory for uncompressed data; skipping compression\n");
+ workspace = NULL;
+ }
+}
+
+static void free_lz4(void)
+{
+ kfree(workspace);
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ big_oops_buf_sz = 0;
+}
+
+static struct pstore_zbackend backend_lz4 = {
+ .compress = compress_lz4,
+ .decompress = decompress_lz4,
+ .allocate = allocate_lz4,
+ .free = free_lz4,
+ .name = "lz4",
+};
+#endif
+
+static struct pstore_zbackend *zbackend =
+#if defined(CONFIG_PSTORE_ZLIB_COMPRESS)
+ &backend_zlib;
+#elif defined(CONFIG_PSTORE_LZO_COMPRESS)
+ &backend_lzo;
+#elif defined(CONFIG_PSTORE_LZ4_COMPRESS)
+ &backend_lz4;
+#else
+ NULL;
+#endif
+
+static int pstore_compress(const void *in, void *out,
+ size_t inlen, size_t outlen)
+{
+ if (zbackend)
+ return zbackend->compress(in, out, inlen, outlen);
+ else
+ return -EIO;
+}
+
+static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+ if (zbackend)
+ return zbackend->decompress(in, out, inlen, outlen);
+ else
+ return -EIO;
+}
+
+static void allocate_buf_for_compression(void)
+{
+ if (zbackend) {
+ pr_info("using %s compression\n", zbackend->name);
+ zbackend->allocate();
+ } else {
+ pr_err("allocate compression buffer error!\n");
+ }
+}
+
+static void free_buf_for_compression(void)
+{
+ if (zbackend)
+ zbackend->free();
+ else
+ pr_err("free compression buffer error!\n");
}
/*
@@ -284,7 +483,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
u64 id;
unsigned int part = 1;
unsigned long flags = 0;
- int is_locked = 0;
+ int is_locked;
int ret;
why = get_reason_str(reason);
@@ -295,8 +494,10 @@ static void pstore_dump(struct kmsg_dumper *dumper,
pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
, in_nmi() ? "NMI" : why);
}
- } else
+ } else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
+ is_locked = 1;
+ }
oopscount++;
while (total < kmsg_bytes) {
char *dst;
@@ -304,19 +505,25 @@ static void pstore_dump(struct kmsg_dumper *dumper,
int hsize;
int zipped_len = -1;
size_t len;
- bool compressed;
+ bool compressed = false;
size_t total_len;
if (big_oops_buf && is_locked) {
dst = big_oops_buf;
- hsize = sprintf(dst, "%s#%d Part%u\n", why,
- oopscount, part);
- size = big_oops_buf_sz - hsize;
+ size = big_oops_buf_sz;
+ } else {
+ dst = psinfo->buf;
+ size = psinfo->bufsize;
+ }
- if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
- size, &len))
- break;
+ hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part);
+ size -= hsize;
+
+ if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
+ size, &len))
+ break;
+ if (big_oops_buf && is_locked) {
zipped_len = pstore_compress(dst, psinfo->buf,
hsize + len, psinfo->bufsize);
@@ -324,21 +531,9 @@ static void pstore_dump(struct kmsg_dumper *dumper,
compressed = true;
total_len = zipped_len;
} else {
- compressed = false;
total_len = copy_kmsg_to_buffer(hsize, len);
}
} else {
- dst = psinfo->buf;
- hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
- part);
- size = psinfo->bufsize - hsize;
- dst += hsize;
-
- if (!kmsg_dump_get_buffer(dumper, true, dst,
- size, &len))
- break;
-
- compressed = false;
total_len = hsize + len;
}
@@ -350,10 +545,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += total_len;
part++;
}
- if (pstore_cannot_block_path(reason)) {
- if (is_locked)
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
- } else
+ if (is_locked)
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
@@ -497,9 +689,11 @@ EXPORT_SYMBOL_GPL(pstore_register);
void pstore_unregister(struct pstore_info *psi)
{
- pstore_unregister_pmsg();
- pstore_unregister_ftrace();
- pstore_unregister_console();
+ if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+ pstore_unregister_pmsg();
+ pstore_unregister_ftrace();
+ pstore_unregister_console();
+ }
pstore_unregister_kmsg();
free_buf_for_compression();
@@ -527,6 +721,7 @@ void pstore_get_records(int quiet)
int failed = 0, rc;
bool compressed;
int unzipped_len = -1;
+ ssize_t ecc_notice_size = 0;
if (!psi)
return;
@@ -536,7 +731,7 @@ void pstore_get_records(int quiet)
goto out;
while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
- psi)) > 0) {
+ &ecc_notice_size, psi)) > 0) {
if (compressed && (type == PSTORE_TYPE_DMESG)) {
if (big_oops_buf)
unzipped_len = pstore_decompress(buf,
@@ -544,6 +739,9 @@ void pstore_get_records(int quiet)
big_oops_buf_sz);
if (unzipped_len > 0) {
+ if (ecc_notice_size)
+ memcpy(big_oops_buf + unzipped_len,
+ buf + size, ecc_notice_size);
kfree(buf);
buf = big_oops_buf;
size = unzipped_len;
@@ -555,7 +753,8 @@ void pstore_get_records(int quiet)
}
}
rc = pstore_mkfile(type, psi->name, id, count, buf,
- compressed, (size_t)size, time, psi);
+ compressed, size + ecc_notice_size,
+ time, psi);
if (unzipped_len < 0) {
/* Free buffer other than big oops */
kfree(buf);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index bd9812e834612..7a034d62cf8c8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -34,6 +34,8 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/pstore_ram.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
#define RAMOOPS_KERNMSG_HDR "===="
#define MIN_MEM_SIZE 4096UL
@@ -181,10 +183,10 @@ static bool prz_ok(struct persistent_ram_zone *prz)
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time,
char **buf, bool *compressed,
+ ssize_t *ecc_notice_size,
struct pstore_info *psi)
{
ssize_t size;
- ssize_t ecc_notice_size;
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz = NULL;
int header_length = 0;
@@ -229,16 +231,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
size = persistent_ram_old_size(prz) - header_length;
/* ECC correction notice */
- ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
+ *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
- *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL);
+ *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL);
if (*buf == NULL)
return -ENOMEM;
memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
- persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1);
+ persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1);
- return size + ecc_notice_size;
+ return size;
}
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
@@ -458,15 +460,89 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
return 0;
}
+static int ramoops_parse_dt_size(struct platform_device *pdev,
+ const char *propname, u32 *value)
+{
+ u32 val32 = 0;
+ int ret;
+
+ ret = of_property_read_u32(pdev->dev.of_node, propname, &val32);
+ if (ret < 0 && ret != -EINVAL) {
+ dev_err(&pdev->dev, "failed to parse property %s: %d\n",
+ propname, ret);
+ return ret;
+ }
+
+ if (val32 > INT_MAX) {
+ dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32);
+ return -EOVERFLOW;
+ }
+
+ *value = val32;
+ return 0;
+}
+
+static int ramoops_parse_dt(struct platform_device *pdev,
+ struct ramoops_platform_data *pdata)
+{
+ struct device_node *of_node = pdev->dev.of_node;
+ struct resource *res;
+ u32 value;
+ int ret;
+
+ dev_dbg(&pdev->dev, "using Device Tree\n");
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res) {
+ dev_err(&pdev->dev,
+ "failed to locate DT /reserved-memory resource\n");
+ return -EINVAL;
+ }
+
+ pdata->mem_size = resource_size(res);
+ pdata->mem_address = res->start;
+ pdata->mem_type = of_property_read_bool(of_node, "unbuffered");
+ pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops");
+
+#define parse_size(name, field) { \
+ ret = ramoops_parse_dt_size(pdev, name, &value); \
+ if (ret < 0) \
+ return ret; \
+ field = value; \
+ }
+
+ parse_size("record-size", pdata->record_size);
+ parse_size("console-size", pdata->console_size);
+ parse_size("ftrace-size", pdata->ftrace_size);
+ parse_size("pmsg-size", pdata->pmsg_size);
+ parse_size("ecc-size", pdata->ecc_info.ecc_size);
+
+#undef parse_size
+
+ return 0;
+}
+
static int ramoops_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
- struct ramoops_platform_data *pdata = pdev->dev.platform_data;
+ struct ramoops_platform_data *pdata = dev->platform_data;
struct ramoops_context *cxt = &oops_cxt;
size_t dump_mem_sz;
phys_addr_t paddr;
int err = -EINVAL;
+ if (dev_of_node(dev) && !pdata) {
+ pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+ if (!pdata) {
+ err = -ENOMEM;
+ goto fail_out;
+ }
+
+ err = ramoops_parse_dt(pdev, pdata);
+ if (err < 0)
+ goto fail_out;
+ }
+
/* Only a single ramoops area allowed at a time, so fail extra
* probes.
*/
@@ -567,11 +643,11 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
- kfree(cxt->mprz);
+ persistent_ram_free(cxt->mprz);
fail_init_mprz:
- kfree(cxt->fprz);
+ persistent_ram_free(cxt->fprz);
fail_init_fprz:
- kfree(cxt->cprz);
+ persistent_ram_free(cxt->cprz);
fail_init_cprz:
ramoops_free_przs(cxt);
fail_out:
@@ -596,11 +672,17 @@ static int ramoops_remove(struct platform_device *pdev)
return 0;
}
+static const struct of_device_id dt_match[] = {
+ { .compatible = "ramoops" },
+ {}
+};
+
static struct platform_driver ramoops_driver = {
.probe = ramoops_probe,
.remove = ramoops_remove,
.driver = {
- .name = "ramoops",
+ .name = "ramoops",
+ .of_match_table = dt_match,
},
};
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index b218f965817bf..781056a0480f4 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -71,7 +71,7 @@ const struct file_operations qnx4_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = qnx4_readdir,
+ .iterate_shared = qnx4_readdir,
.fsync = generic_file_fsync,
};
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index e1f37278cf97b..27637e0bdc9f1 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -35,9 +35,9 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
static unsigned last_entry(struct inode *inode, unsigned long page_nr)
{
unsigned long last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte / QNX6_DIR_ENTRY_SIZE;
}
@@ -47,9 +47,9 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
{
struct qnx6_sb_info *sbi = QNX6_SB(sb);
u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */
- u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */
+ u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */
/* within page */
- u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK;
+ u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK;
struct address_space *mapping = sbi->longfile->i_mapping;
struct page *page = read_mapping_page(mapping, n, NULL);
if (IS_ERR(page))
@@ -115,8 +115,8 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
struct qnx6_sb_info *sbi = QNX6_SB(s);
loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
unsigned long npages = dir_pages(inode);
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
- unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
+ unsigned long n = pos >> PAGE_SHIFT;
+ unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE;
bool done = false;
ctx->pos = pos;
@@ -131,7 +131,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
if (IS_ERR(page)) {
pr_err("%s(): read failed\n", __func__);
- ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
+ ctx->pos = (n + 1) << PAGE_SHIFT;
return PTR_ERR(page);
}
de = ((struct qnx6_dir_entry *)page_address(page)) + start;
@@ -272,7 +272,7 @@ found:
const struct file_operations qnx6_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = qnx6_readdir,
+ .iterate_shared = qnx6_readdir,
.fsync = generic_file_fsync,
};
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 47bb1de07155e..1192422a1c562 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -542,8 +542,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
iget_failed(inode);
return ERR_PTR(-EIO);
}
- n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS);
- offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS);
+ n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS);
+ offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS);
mapping = sbi->inodes->i_mapping;
page = read_mapping_page(mapping, n, NULL);
if (IS_ERR(page)) {
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index d3fb2b6988002..f23b5c4a66ad7 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -128,7 +128,7 @@ extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s,
static inline void qnx6_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ba827daea5a0b..1bfac28b7e7df 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid)
unsigned int hashent = hashfn(sb, qid);
struct dquot *dquot, *empty = NULL;
+ if (!qid_has_mapping(sb->s_user_ns, qid))
+ return ERR_PTR(-EINVAL);
+
if (!sb_has_quota_active(sb, qid.type))
return ERR_PTR(-ESRCH);
we_slept:
@@ -1133,7 +1136,7 @@ static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
else
dquot->dq_dqb.dqb_curinodes = 0;
if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
- dquot->dq_dqb.dqb_itime = (time_t) 0;
+ dquot->dq_dqb.dqb_itime = (time64_t) 0;
clear_bit(DQ_INODES_B, &dquot->dq_flags);
}
@@ -1145,7 +1148,7 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number)
else
dquot->dq_dqb.dqb_curspace = 0;
if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
- dquot->dq_dqb.dqb_btime = (time_t) 0;
+ dquot->dq_dqb.dqb_btime = (time64_t) 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
@@ -1292,7 +1295,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
if (dquot->dq_dqb.dqb_isoftlimit &&
newinodes > dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_itime &&
- get_seconds() >= dquot->dq_dqb.dqb_itime &&
+ ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime &&
!ignore_hardlimit(dquot)) {
prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
return -EDQUOT;
@@ -1302,7 +1305,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
newinodes > dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_itime == 0) {
prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
- dquot->dq_dqb.dqb_itime = get_seconds() +
+ dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() +
sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
}
@@ -1334,7 +1337,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
if (dquot->dq_dqb.dqb_bsoftlimit &&
tspace > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime &&
- get_seconds() >= dquot->dq_dqb.dqb_btime &&
+ ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
@@ -1346,7 +1349,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
dquot->dq_dqb.dqb_btime == 0) {
if (!prealloc) {
prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
- dquot->dq_dqb.dqb_btime = get_seconds() +
+ dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() +
sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
}
else
@@ -2047,11 +2050,20 @@ int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
struct quota_info *dqopt = sb_dqopt(sb);
int err;
- if (!dqopt->ops[qid->type]->get_next_id)
- return -ENOSYS;
+ mutex_lock(&dqopt->dqonoff_mutex);
+ if (!sb_has_quota_active(sb, qid->type)) {
+ err = -ESRCH;
+ goto out;
+ }
+ if (!dqopt->ops[qid->type]->get_next_id) {
+ err = -ENOSYS;
+ goto out;
+ }
mutex_lock(&dqopt->dqio_mutex);
err = dqopt->ops[qid->type]->get_next_id(sb, qid);
mutex_unlock(&dqopt->dqio_mutex);
+out:
+ mutex_unlock(&dqopt->dqonoff_mutex);
return err;
}
@@ -2259,6 +2271,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
error = -EINVAL;
goto out_fmt;
}
+ /* Filesystems outside of init_user_ns not yet supported */
+ if (sb->s_user_ns != &init_user_ns) {
+ error = -EINVAL;
+ goto out_fmt;
+ }
/* Usage always has to be set... */
if (!(flags & DQUOT_USAGE_ENABLED)) {
error = -EINVAL;
@@ -2686,7 +2703,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
} else if (!(di->d_fieldmask & QC_SPC_TIMER))
/* Set grace only if user hasn't provided his own... */
- dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
+ dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace;
}
if (check_ilim) {
if (!dm->dqb_isoftlimit ||
@@ -2695,7 +2712,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
clear_bit(DQ_INODES_B, &dquot->dq_flags);
} else if (!(di->d_fieldmask & QC_INO_TIMER))
/* Set grace only if user hasn't provided his own... */
- dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
+ dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace;
}
if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
dm->dqb_isoftlimit)
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index d07a2f91d8580..8b252673d4540 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -47,7 +47,7 @@ void quota_send_warning(struct kqid qid, dev_t dev,
void *msg_head;
int ret;
int msg_size = 4 * nla_total_size(sizeof(u32)) +
- 2 * nla_total_size(sizeof(u64));
+ 2 * nla_total_size_64bit(sizeof(u64));
/* We have to allocate using GFP_NOFS as we are called from a
* filesystem performing write and thus further recursion into
@@ -68,8 +68,9 @@ void quota_send_warning(struct kqid qid, dev_t dev,
ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type);
if (ret)
goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID,
- from_kqid_munged(&init_user_ns, qid));
+ ret = nla_put_u64_64bit(skb, QUOTA_NL_A_EXCESS_ID,
+ from_kqid_munged(&init_user_ns, qid),
+ QUOTA_NL_A_PAD);
if (ret)
goto attr_err_out;
ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
@@ -81,8 +82,9 @@ void quota_send_warning(struct kqid qid, dev_t dev,
ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
if (ret)
goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID,
- from_kuid_munged(&init_user_ns, current_uid()));
+ ret = nla_put_u64_64bit(skb, QUOTA_NL_A_CAUSED_ID,
+ from_kuid_munged(&init_user_ns, current_uid()),
+ QUOTA_NL_A_PAD);
if (ret)
goto attr_err_out;
genlmsg_end(skb, msg_head);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 0f10ee9892ce3..35df08ee9c97d 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
if (ret)
@@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_nextdqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
if (ret)
@@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
copy_from_if_dqblk(&fdq, &idq);
return sb->s_qcop->set_dqblk(sb, qid, &fdq);
@@ -581,10 +581,10 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
/* Are we actually setting timer / warning limits for all users? */
- if (from_kqid(&init_user_ns, qid) == 0 &&
+ if (from_kqid(sb->s_user_ns, qid) == 0 &&
fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) {
struct qc_info qinfo;
int ret;
@@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
if (ret)
@@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_nextdqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
if (ret)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index a586467f6ff6c..be3ddd189cd43 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -211,14 +211,11 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
struct page **pages = NULL, **ptr, *page;
loff_t isize;
- if (!(flags & MAP_SHARED))
- return addr;
-
/* the mapping mustn't extend beyond the EOF */
lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
isize = i_size_read(inode);
- ret = -EINVAL;
+ ret = -ENOSYS;
maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (pgoff >= maxpages)
goto out;
@@ -227,7 +224,6 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
goto out;
/* gang-find the pages */
- ret = -ENOMEM;
pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto out_free;
@@ -263,7 +259,7 @@ out:
*/
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
return -ENOSYS;
file_accessed(file);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 38981b0375243..1ab6e6c2e60e7 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -223,8 +223,8 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent)
return err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;
diff --git a/fs/read_write.c b/fs/read_write.c
index cf377cf9dfe38..66215a7b17cf1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -302,18 +302,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(vfs_llseek);
-static inline struct fd fdget_pos(int fd)
-{
- return __to_fd(__fdget_pos(fd));
-}
-
-static inline void fdput_pos(struct fd f)
-{
- if (f.flags & FDPUT_POS_UNLOCK)
- mutex_unlock(&f.file->f_pos_lock);
- fdput(f);
-}
-
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
off_t retval;
@@ -410,11 +398,6 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
}
EXPORT_SYMBOL(vfs_iter_write);
-/*
- * rw_verify_area doesn't like huge counts. We limit
- * them to something that fits in "int" so that others
- * won't have to do range checks all the time.
- */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
struct inode *inode;
@@ -441,11 +424,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (retval < 0)
return retval;
}
- retval = security_file_permission(file,
+ return security_file_permission(file,
read_write == READ ? MAY_READ : MAY_WRITE);
- if (retval)
- return retval;
- return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
@@ -489,8 +469,9 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count);
- if (ret >= 0) {
- count = ret;
+ if (!ret) {
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
ret = __vfs_read(file, buf, count, pos);
if (ret > 0) {
fsnotify_access(file);
@@ -572,8 +553,9 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
- if (ret >= 0) {
- count = ret;
+ if (!ret) {
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
file_start_write(file);
ret = __vfs_write(file, buf, count, pos);
if (ret > 0) {
@@ -698,12 +680,16 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
struct kiocb kiocb;
ssize_t ret;
- if (flags & ~RWF_HIPRI)
+ if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
return -EOPNOTSUPP;
init_sync_kiocb(&kiocb, filp);
if (flags & RWF_HIPRI)
kiocb.ki_flags |= IOCB_HIPRI;
+ if (flags & RWF_DSYNC)
+ kiocb.ki_flags |= IOCB_DSYNC;
+ if (flags & RWF_SYNC)
+ kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
kiocb.ki_pos = *ppos;
ret = fn(&kiocb, iter);
@@ -1182,6 +1168,15 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
+#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
+COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen, loff_t, pos, int, flags)
+{
+ return do_compat_preadv64(fd, vec, vlen, pos, flags);
+}
+#endif
+
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
@@ -1279,6 +1274,15 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
+#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
+COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen, loff_t, pos, int, flags)
+{
+ return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+}
+#endif
+
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
@@ -1323,7 +1327,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
retval = rw_verify_area(READ, in.file, &pos, count);
if (retval < 0)
goto fput_in;
- count = retval;
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
/*
* Get output file, and verify that it is ok..
@@ -1341,7 +1346,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
retval = rw_verify_area(WRITE, out.file, &out_pos, count);
if (retval < 0)
goto fput_out;
- count = retval;
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
@@ -1485,11 +1489,12 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (flags != 0)
return -EINVAL;
- /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
ret = rw_verify_area(READ, file_in, &pos_in, len);
- if (ret >= 0)
- ret = rw_verify_area(WRITE, file_out, &pos_out, len);
- if (ret < 0)
+ if (unlikely(ret))
+ return ret;
+
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (unlikely(ret))
return ret;
if (!(file_in->f_mode & FMODE_READ) ||
diff --git a/fs/readdir.c b/fs/readdir.c
index e69ef3b79787b..9d0212c374d6d 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -24,27 +24,40 @@
int iterate_dir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
+ bool shared = false;
int res = -ENOTDIR;
- if (!file->f_op->iterate)
+ if (file->f_op->iterate_shared)
+ shared = true;
+ else if (!file->f_op->iterate)
goto out;
res = security_file_permission(file, MAY_READ);
if (res)
goto out;
- res = mutex_lock_killable(&inode->i_mutex);
- if (res)
- goto out;
+ if (shared) {
+ inode_lock_shared(inode);
+ } else {
+ res = down_write_killable(&inode->i_rwsem);
+ if (res)
+ goto out;
+ }
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
ctx->pos = file->f_pos;
- res = file->f_op->iterate(file, ctx);
+ if (shared)
+ res = file->f_op->iterate_shared(file, ctx);
+ else
+ res = file->f_op->iterate(file, ctx);
file->f_pos = ctx->pos;
fsnotify_access(file);
file_accessed(file);
}
- inode_unlock(inode);
+ if (shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
out:
return res;
}
@@ -111,7 +124,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
.dirent = dirent
@@ -124,7 +137,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
if (buf.result)
error = buf.result;
- fdput(f);
+ fdput_pos(f);
return error;
}
@@ -169,6 +182,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
}
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user(offset, &dirent->d_off))
goto efault;
}
@@ -208,7 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (!access_ok(VERIFY_WRITE, dirent, count))
return -EFAULT;
- f = fdget(fd);
+ f = fdget_pos(fd);
if (!f.file)
return -EBADF;
@@ -222,7 +237,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput(f);
+ fdput_pos(f);
return error;
}
@@ -248,6 +263,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
return -EINVAL;
dirent = buf->previous;
if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
if (__put_user(offset, &dirent->d_off))
goto efault;
}
@@ -289,7 +306,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (!access_ok(VERIFY_WRITE, dirent, count))
return -EFAULT;
- f = fdget(fd);
+ f = fdget_pos(fd);
if (!f.file)
return -EBADF;
@@ -304,6 +321,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
- fdput(f);
+ fdput_pos(f);
return error;
}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 3abd4004184ba..45aa05e2232f9 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -20,7 +20,7 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
const struct file_operations reiserfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = reiserfs_readdir,
+ .iterate_shared = reiserfs_readdir,
.fsync = reiserfs_dir_fsync,
.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9424a4ba93a95..90f815bdfa8a1 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -180,11 +180,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
int partial = 0;
unsigned blocksize;
struct buffer_head *bh, *head;
- unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
int new;
int logit = reiserfs_file_data_log(inode);
struct super_block *s = inode->i_sb;
- int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+ int bh_per_page = PAGE_SIZE / s->s_blocksize;
struct reiserfs_transaction_handle th;
int ret = 0;
@@ -260,10 +260,10 @@ const struct file_operations reiserfs_file_operations = {
const struct inode_operations reiserfs_file_inode_operations = {
.setattr = reiserfs_setattr,
- .setxattr = reiserfs_setxattr,
- .getxattr = reiserfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = reiserfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index b751eea32e207..5db6f45b3fed6 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -1153,8 +1153,9 @@ int balance_internal(struct tree_balance *tb,
insert_ptr);
}
- memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
insert_ptr[0] = new_insert_ptr;
+ if (new_insert_ptr)
+ memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
return order;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ae9e5b308cf9f..c2c59f9ff04be 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -386,7 +386,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
goto finished;
}
/* read file tail into part of page */
- offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
+ offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
copy_item_head(&tmp_ih, ih);
/*
@@ -587,10 +587,10 @@ static int convert_tail_for_hole(struct inode *inode,
return -EIO;
/* always try to read until the end of the block */
- tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
+ tail_start = tail_offset & (PAGE_SIZE - 1);
tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
- index = tail_offset >> PAGE_CACHE_SHIFT;
+ index = tail_offset >> PAGE_SHIFT;
/*
* hole_page can be zero in case of direct_io, we are sure
* that we cannot get here if we write with O_DIRECT into tail page
@@ -629,7 +629,7 @@ static int convert_tail_for_hole(struct inode *inode,
unlock:
if (tail_page != hole_page) {
unlock_page(tail_page);
- page_cache_release(tail_page);
+ put_page(tail_page);
}
out:
return retval;
@@ -2189,11 +2189,11 @@ static int grab_tail_page(struct inode *inode,
* we want the page with the last byte in the file,
* not the page that will hold the next byte for appending
*/
- unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
unsigned long pos = 0;
unsigned long start = 0;
unsigned long blocksize = inode->i_sb->s_blocksize;
- unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
+ unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
struct buffer_head *bh;
struct buffer_head *head;
struct page *page;
@@ -2251,7 +2251,7 @@ out:
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return error;
}
@@ -2265,7 +2265,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
{
struct reiserfs_transaction_handle th;
/* we want the offset for the first byte after the end of the file */
- unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned length;
struct page *page = NULL;
@@ -2345,7 +2345,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
}
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
reiserfs_write_unlock(inode->i_sb);
@@ -2354,7 +2354,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
out:
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
reiserfs_write_unlock(inode->i_sb);
@@ -2426,7 +2426,7 @@ research:
} else if (is_direct_le_ih(ih)) {
char *p;
p = page_address(bh_result->b_page);
- p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
+ p += (byte_offset - 1) & (PAGE_SIZE - 1);
copy_size = ih_item_len(ih) - pos_in_item;
fs_gen = get_generation(inode->i_sb);
@@ -2525,7 +2525,7 @@ static int reiserfs_write_full_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = inode->i_size >> PAGE_SHIFT;
int error = 0;
unsigned long block;
sector_t last_block;
@@ -2535,7 +2535,7 @@ static int reiserfs_write_full_page(struct page *page,
int checked = PageChecked(page);
struct reiserfs_transaction_handle th;
struct super_block *s = inode->i_sb;
- int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+ int bh_per_page = PAGE_SIZE / s->s_blocksize;
th.t_trans_id = 0;
/* no logging allowed when nonblocking or from PF_MEMALLOC */
@@ -2564,16 +2564,16 @@ static int reiserfs_write_full_page(struct page *page,
if (page->index >= end_index) {
unsigned last_offset;
- last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ last_offset = inode->i_size & (PAGE_SIZE - 1);
/* no file contents in this page */
if (page->index >= end_index + 1 || !last_offset) {
unlock_page(page);
return 0;
}
- zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, last_offset, PAGE_SIZE);
}
bh = head;
- block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
+ block = page->index << (PAGE_SHIFT - s->s_blocksize_bits);
last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
/* first map all the buffers, logging any direct items we find */
do {
@@ -2668,7 +2668,7 @@ static int reiserfs_write_full_page(struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
nr++;
}
put_bh(bh);
@@ -2728,7 +2728,7 @@ fail:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
nr++;
}
put_bh(bh);
@@ -2774,7 +2774,7 @@ static int reiserfs_write_begin(struct file *file,
*fsdata = (void *)(unsigned long)flags;
}
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -2822,7 +2822,7 @@ static int reiserfs_write_begin(struct file *file,
}
if (ret) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
/* Truncate allocated blocks */
reiserfs_truncate_failed_write(inode);
}
@@ -2909,7 +2909,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
else
th = NULL;
- start = pos & (PAGE_CACHE_SIZE - 1);
+ start = pos & (PAGE_SIZE - 1);
if (unlikely(copied < len)) {
if (!PageUptodate(page))
copied = 0;
@@ -2974,7 +2974,7 @@ out:
if (locked)
reiserfs_write_unlock(inode->i_sb);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (pos + len > inode->i_size)
reiserfs_truncate_failed_write(inode);
@@ -2996,7 +2996,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
- loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
+ loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
int ret = 0;
int update_sd = 0;
struct reiserfs_transaction_handle *th = NULL;
@@ -3181,7 +3181,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
struct inode *inode = page->mapping->host;
unsigned int curr_off = 0;
unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_CACHE_SIZE);
+ int partial_page = (offset || length < PAGE_SIZE);
int ret = 1;
BUG_ON(!PageLocked(page));
@@ -3279,15 +3279,14 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
* We thank Mingming Cao for helping us understand in great detail what
* to do in this section of the code.
*/
-static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(iocb, inode, iter, offset,
+ ret = blockdev_direct_IO(iocb, inode, iter,
reiserfs_get_blocks_direct_io);
/*
@@ -3296,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
*/
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
+ loff_t end = iocb->ki_pos + count;
if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
truncate_setsize(inode, isize);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 036a1fc0a8c35..2f1ddc9080132 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -187,7 +187,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
}
/* we need to make sure nobody is changing the file size beneath us */
- reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+{
+ int depth = reiserfs_write_unlock_nested(inode->i_sb);
+ inode_lock(inode);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
+}
reiserfs_write_lock(inode->i_sb);
@@ -203,7 +207,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
* __reiserfs_write_begin on that page. This will force a
* reiserfs_get_block to unpack the tail for us.
*/
- index = inode->i_size >> PAGE_CACHE_SHIFT;
+ index = inode->i_size >> PAGE_SHIFT;
mapping = inode->i_mapping;
page = grab_cache_page(mapping, index);
retval = -ENOMEM;
@@ -221,7 +225,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
out_unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
inode_unlock(inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 44c2bdced1c87..bc2dde2423c2e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -599,18 +599,18 @@ static int journal_list_still_alive(struct super_block *s,
* This does a check to see if the buffer belongs to one of these
* lost pages before doing the final put_bh. If page->mapping was
* null, it tries to free buffers on the page, which should make the
- * final page_cache_release drop the page from the lru.
+ * final put_page drop the page from the lru.
*/
static void release_buffer_page(struct buffer_head *bh)
{
struct page *page = bh->b_page;
if (!page->mapping && trylock_page(page)) {
- page_cache_get(page);
+ get_page(page);
put_bh(bh);
if (!page->mapping)
try_to_free_buffers(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
} else {
put_bh(bh);
}
@@ -652,7 +652,7 @@ static void submit_logged_buffer(struct buffer_head *bh)
BUG();
if (!buffer_uptodate(bh))
BUG();
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
}
static void submit_ordered_buffer(struct buffer_head *bh)
@@ -662,7 +662,7 @@ static void submit_ordered_buffer(struct buffer_head *bh)
clear_buffer_dirty(bh);
if (!buffer_uptodate(bh))
BUG();
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, 0, bh);
}
#define CHUNK_SIZE 32
@@ -870,7 +870,7 @@ loop_next:
*/
if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
spin_unlock(lock);
- ll_rw_block(WRITE, 1, &bh);
+ ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
spin_lock(lock);
}
put_bh(bh);
@@ -1057,7 +1057,7 @@ static int flush_commit_list(struct super_block *s,
if (tbh) {
if (buffer_dirty(tbh)) {
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(WRITE, 1, &tbh);
+ ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh);
reiserfs_write_lock_nested(s, depth);
}
put_bh(tbh) ;
@@ -2244,7 +2244,7 @@ abort_replay:
}
}
/* read in the log blocks, memcpy to the corresponding real block */
- ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
+ ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks);
for (i = 0; i < get_desc_trans_len(desc); i++) {
wait_on_buffer(log_blocks[i]);
@@ -2269,7 +2269,7 @@ abort_replay:
/* flush out the real blocks */
for (i = 0; i < get_desc_trans_len(desc); i++) {
set_buffer_dirty(real_blocks[i]);
- write_dirty_buffer(real_blocks[i], WRITE);
+ write_dirty_buffer(real_blocks[i], 0);
}
for (i = 0; i < get_desc_trans_len(desc); i++) {
wait_on_buffer(real_blocks[i]);
@@ -2346,7 +2346,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
} else
bhlist[j++] = bh;
}
- ll_rw_block(READ, j, bhlist);
+ ll_rw_block(REQ_OP_READ, 0, j, bhlist);
for (i = 1; i < j; i++)
brelse(bhlist[i]);
bh = bhlist[0];
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 2a12d46d7fb41..8a36696d6df99 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1650,10 +1650,10 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.mknod = reiserfs_mknod,
.rename = reiserfs_rename,
.setattr = reiserfs_setattr,
- .setxattr = reiserfs_setxattr,
- .getxattr = reiserfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = reiserfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
@@ -1667,10 +1667,10 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = page_get_link,
.setattr = reiserfs_setattr,
- .setxattr = reiserfs_setxattr,
- .getxattr = reiserfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = reiserfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = reiserfs_permission,
};
@@ -1679,10 +1679,10 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
*/
const struct inode_operations reiserfs_special_inode_operations = {
.setattr = reiserfs_setattr,
- .setxattr = reiserfs_setxattr,
- .getxattr = reiserfs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = reiserfs_removexattr,
+ .removexattr = generic_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 99a5d5dae46a5..415d66ca87d1b 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -3,8 +3,8 @@
*/
#include <linux/string.h>
-#include <linux/random.h>
#include <linux/time.h>
+#include <linux/uuid.h>
#include "reiserfs.h"
/* find where objectid map starts */
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 24cbe013240fa..4032d1e87c8fa 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -551,7 +551,7 @@ static int search_by_key_reada(struct super_block *s,
if (!buffer_uptodate(bh[j])) {
if (depth == -1)
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(READA, 1, bh + j);
+ ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j);
}
brelse(bh[j]);
}
@@ -660,7 +660,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
if (!buffer_uptodate(bh) && depth == -1)
depth = reiserfs_write_unlock_nested(sb);
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
if (depth != -1)
@@ -1342,7 +1342,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
*/
data = kmap_atomic(un_bh->b_page);
- off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
+ off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
memcpy(data + off,
ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
ret_value);
@@ -1511,7 +1511,7 @@ static void unmap_buffers(struct page *page, loff_t pos)
if (page) {
if (page_has_buffers(page)) {
- tail_index = pos & (PAGE_CACHE_SIZE - 1);
+ tail_index = pos & (PAGE_SIZE - 1);
cur_index = 0;
head = page_buffers(page);
bh = head;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b8f2d1e8c6453..7a4a85a6821e7 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1393,7 +1393,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
unsigned long safe_mask = 0;
unsigned int commit_max_age = (unsigned int)-1;
struct reiserfs_journal *journal = SB_JOURNAL(s);
- char *new_opts = kstrdup(arg, GFP_KERNEL);
+ char *new_opts;
int err;
char *qf_names[REISERFS_MAXQUOTAS];
unsigned int qfmt = 0;
@@ -1401,6 +1401,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
int i;
#endif
+ new_opts = kstrdup(arg, GFP_KERNEL);
+ if (arg && !new_opts)
+ return -ENOMEM;
+
sync_filesystem(s);
reiserfs_write_lock(s);
@@ -1546,7 +1550,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
}
out_ok_unlocked:
- replace_mount_options(s, new_opts);
+ if (new_opts)
+ replace_mount_options(s, new_opts);
return 0;
out_err_unlock:
@@ -1661,7 +1666,7 @@ static int read_super_block(struct super_block *s, int offset)
/* after journal replay, reread all bitmap and super blocks */
static int reread_meta_blocks(struct super_block *s)
{
- ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s));
+ ll_rw_block(REQ_OP_READ, 0, 1, &SB_BUFFER_WITH_SB(s));
wait_on_buffer(SB_BUFFER_WITH_SB(s));
if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index f41e19b4bb42f..2d5489b0a2693 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -151,7 +151,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
*/
if (up_to_date_bh) {
unsigned pgoff =
- (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
+ (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
char *kaddr = kmap_atomic(up_to_date_bh->b_page);
memset(kaddr + pgoff, 0, blk_size - total_tail);
kunmap_atomic(kaddr);
@@ -271,7 +271,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
* the page was locked and this part of the page was up to date when
* indirect2direct was called, so we know the bytes are still valid
*/
- tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
+ tail = tail + (pos & (PAGE_SIZE - 1));
PATH_LAST_POSITION(path)++;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 57e0b23105327..a33812ae9fad2 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -415,7 +415,7 @@ out:
static inline void reiserfs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
static struct page *reiserfs_get_page(struct inode *dir, size_t n)
@@ -427,7 +427,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
* and an unlink/rmdir has just occurred - GFP_NOFS avoids this
*/
mapping_set_gfp_mask(mapping, GFP_NOFS);
- page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
+ page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
if (!IS_ERR(page)) {
kmap(page);
if (PageError(page))
@@ -526,10 +526,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
while (buffer_pos < buffer_size || buffer_pos == 0) {
size_t chunk;
size_t skip = 0;
- size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
+ size_t page_offset = (file_pos & (PAGE_SIZE - 1));
- if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
- chunk = PAGE_CACHE_SIZE;
+ if (buffer_size - buffer_pos > PAGE_SIZE)
+ chunk = PAGE_SIZE;
else
chunk = buffer_size - buffer_pos;
@@ -546,8 +546,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
struct reiserfs_xattr_header *rxh;
skip = file_pos = sizeof(struct reiserfs_xattr_header);
- if (chunk + skip > PAGE_CACHE_SIZE)
- chunk = PAGE_CACHE_SIZE - skip;
+ if (chunk + skip > PAGE_SIZE)
+ chunk = PAGE_SIZE - skip;
rxh = (struct reiserfs_xattr_header *)data;
rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
rxh->h_hash = cpu_to_le32(xahash);
@@ -675,8 +675,8 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
char *data;
size_t skip = 0;
- if (isize - file_pos > PAGE_CACHE_SIZE)
- chunk = PAGE_CACHE_SIZE;
+ if (isize - file_pos > PAGE_SIZE)
+ chunk = PAGE_SIZE;
else
chunk = isize - file_pos;
@@ -764,60 +764,6 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers,
return xah;
}
-
-/*
- * Inode operation getxattr()
- */
-ssize_t
-reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
- size_t size)
-{
- const struct xattr_handler *handler;
-
- handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
-
- if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
- return -EOPNOTSUPP;
-
- return handler->get(handler, dentry, name, buffer, size);
-}
-
-/*
- * Inode operation setxattr()
- *
- * d_inode(dentry)->i_mutex down
- */
-int
-reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags)
-{
- const struct xattr_handler *handler;
-
- handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
-
- if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
- return -EOPNOTSUPP;
-
- return handler->set(handler, dentry, name, value, size, flags);
-}
-
-/*
- * Inode operation removexattr()
- *
- * d_inode(dentry)->i_mutex down
- */
-int reiserfs_removexattr(struct dentry *dentry, const char *name)
-{
- const struct xattr_handler *handler;
-
- handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
-
- if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
- return -EOPNOTSUPP;
-
- return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
-}
-
struct listxattr_buf {
struct dir_context ctx;
size_t size;
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index 15dde6262c00e..613ff5aef94ea 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -2,6 +2,7 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/rwsem.h>
+#include <linux/xattr.h>
struct inode;
struct dentry;
@@ -18,12 +19,7 @@ int reiserfs_permission(struct inode *inode, int mask);
#ifdef CONFIG_REISERFS_FS_XATTR
#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
-ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size);
-int reiserfs_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags);
ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-int reiserfs_removexattr(struct dentry *dentry, const char *name);
int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
@@ -92,10 +88,7 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
#else
-#define reiserfs_getxattr NULL
-#define reiserfs_setxattr NULL
#define reiserfs_listxattr NULL
-#define reiserfs_removexattr NULL
static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
{
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 558a16beaacb9..dbed42f755e01 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -197,10 +197,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
size = reiserfs_xattr_get(inode, name, NULL, 0);
if (size < 0) {
- if (size == -ENODATA || size == -ENOSYS) {
- set_cached_acl(inode, type, NULL);
+ if (size == -ENODATA || size == -ENOSYS)
return NULL;
- }
return ERR_PTR(size);
}
@@ -220,8 +218,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
} else {
acl = reiserfs_posix_acl_from_disk(value, retval);
}
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
kfree(value);
return acl;
@@ -370,7 +366,7 @@ int reiserfs_cache_default_acl(struct inode *inode)
if (IS_PRIVATE(inode))
return 0;
- acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_acl(inode, ACL_TYPE_DEFAULT);
if (acl && !IS_ERR(acl)) {
int size = reiserfs_acl_size(acl->a_count);
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ab0217d320396..e4cbb77199063 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,29 +9,27 @@
#include <linux/uaccess.h>
static int
-security_get(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, void *buffer, size_t size)
+security_get(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, void *buffer, size_t size)
{
- if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
- return -EINVAL;
-
- if (IS_PRIVATE(d_inode(dentry)))
+ if (IS_PRIVATE(inode))
return -EPERM;
- return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
+ return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
+ buffer, size);
}
static int
-security_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *buffer, size_t size, int flags)
+security_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags)
{
- if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
- return -EINVAL;
-
- if (IS_PRIVATE(d_inode(dentry)))
+ if (IS_PRIVATE(inode))
return -EPERM;
- return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
+ return reiserfs_xattr_set(inode,
+ xattr_full_name(handler, name),
+ buffer, size, flags);
}
static bool security_list(struct dentry *dentry)
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 64b67aa643a96..f15a5f9e84ce1 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,29 +8,27 @@
#include <linux/uaccess.h>
static int
-trusted_get(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, void *buffer, size_t size)
+trusted_get(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, void *buffer, size_t size)
{
- if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+ if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
return -EPERM;
- return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
+ return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
+ buffer, size);
}
static int
-trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *buffer, size_t size, int flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags)
{
- if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+ if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
return -EPERM;
- return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
+ return reiserfs_xattr_set(inode,
+ xattr_full_name(handler, name),
+ buffer, size, flags);
}
static bool trusted_list(struct dentry *dentry)
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 12e6306f562a3..dc59df43b2dbf 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,27 +7,25 @@
#include <linux/uaccess.h>
static int
-user_get(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, void *buffer, size_t size)
+user_get(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, void *buffer, size_t size)
{
-
- if (strlen(name) < sizeof(XATTR_USER_PREFIX))
- return -EINVAL;
- if (!reiserfs_xattrs_user(dentry->d_sb))
+ if (!reiserfs_xattrs_user(inode->i_sb))
return -EOPNOTSUPP;
- return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
+ return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
+ buffer, size);
}
static int
-user_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *buffer, size_t size, int flags)
+user_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *buffer,
+ size_t size, int flags)
{
- if (strlen(name) < sizeof(XATTR_USER_PREFIX))
- return -EINVAL;
-
- if (!reiserfs_xattrs_user(dentry->d_sb))
+ if (!reiserfs_xattrs_user(inode->i_sb))
return -EOPNOTSUPP;
- return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
+ return reiserfs_xattr_set(inode,
+ xattr_full_name(handler, name),
+ buffer, size, flags);
}
static bool user_list(struct dentry *dentry)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 6b00ca357c58f..d0f8a38dfafac 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -280,8 +280,8 @@ error:
static const struct file_operations romfs_dir_operations = {
.read = generic_read_dir,
- .iterate = romfs_readdir,
- .llseek = default_llseek,
+ .iterate_shared = romfs_readdir,
+ .llseek = generic_file_llseek,
};
static const struct inode_operations romfs_dir_inode_operations = {
diff --git a/fs/select.c b/fs/select.c
index 869293988c2a0..8ed9da50896a1 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -47,7 +47,7 @@
#define MAX_SLACK (100 * NSEC_PER_MSEC)
-static long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec64 *tv)
{
long slack;
int divfactor = 1000;
@@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv)
return slack;
}
-u64 select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec64 *tv)
{
u64 ret;
- struct timespec now;
+ struct timespec64 now;
/*
* Realtime tasks get a slack of 0 for obvious reasons.
@@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv)
if (rt_task(current))
return 0;
- ktime_get_ts(&now);
- now = timespec_sub(*tv, now);
+ ktime_get_ts64(&now);
+ now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
if (ret < current->timer_slack_ns)
return current->timer_slack_ns;
@@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout);
/**
* poll_select_set_timeout - helper function to setup the timeout value
- * @to: pointer to timespec variable for the final timeout
+ * @to: pointer to timespec64 variable for the final timeout
* @sec: seconds (from user space)
* @nsec: nanoseconds (from user space)
*
@@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout);
*
* Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
*/
-int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
- struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+ struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
- if (!timespec_valid(&ts))
+ if (!timespec64_valid(&ts))
return -EINVAL;
/* Optimize for the zero timeout value here */
if (!sec && !nsec) {
to->tv_sec = to->tv_nsec = 0;
} else {
- ktime_get_ts(to);
- *to = timespec_add_safe(*to, ts);
+ ktime_get_ts64(to);
+ *to = timespec64_add_safe(*to, ts);
}
return 0;
}
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+static int poll_select_copy_remaining(struct timespec64 *end_time,
+ void __user *p,
int timeval, int ret)
{
+ struct timespec64 rts64;
struct timespec rts;
struct timeval rtv;
@@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
if (!end_time->tv_sec && !end_time->tv_nsec)
return ret;
- ktime_get_ts(&rts);
- rts = timespec_sub(*end_time, rts);
- if (rts.tv_sec < 0)
- rts.tv_sec = rts.tv_nsec = 0;
+ ktime_get_ts64(&rts64);
+ rts64 = timespec64_sub(*end_time, rts64);
+ if (rts64.tv_sec < 0)
+ rts64.tv_sec = rts64.tv_nsec = 0;
+
+ rts = timespec64_to_timespec(rts64);
if (timeval) {
if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
memset(&rtv, 0, sizeof(rtv));
- rtv.tv_sec = rts.tv_sec;
- rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+ rtv.tv_sec = rts64.tv_sec;
+ rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
if (!copy_to_user(p, &rtv, sizeof(rtv)))
return ret;
@@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
+int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* I'm trying ERESTARTNOHAND which restart only when you want to.
*/
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec *end_time)
+ fd_set __user *exp, struct timespec64 *end_time)
{
fd_set_bits fds;
void *bits;
@@ -622,7 +626,7 @@ out_nofds:
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
fd_set __user *, exp, struct timeval __user *, tvp)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
struct timeval tv;
int ret;
@@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 ts64, end_time, *to = NULL;
int ret;
if (tsp) {
if (copy_from_user(&ts, tsp, sizeof(ts)))
return -EFAULT;
+ ts64 = timespec_to_timespec64(ts);
to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
return -EINVAL;
}
@@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
}
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
@@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
sizeof(struct pollfd))
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
@@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block)
{
struct pollfd __user *ufds = restart_block->poll.ufds;
int nfds = restart_block->poll.nfds;
- struct timespec *to = NULL, end_time;
+ struct timespec64 *to = NULL, end_time;
int ret;
if (restart_block->poll.has_timeout) {
@@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block)
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
int, timeout_msecs)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (timeout_msecs >= 0) {
@@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
size_t, sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (tsp) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e85664b7c7d96..19f532e7d35e9 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -72,9 +72,10 @@ int seq_open(struct file *file, const struct seq_operations *op)
mutex_init(&p->lock);
p->op = op;
-#ifdef CONFIG_USER_NS
- p->user_ns = file->f_cred->user_ns;
-#endif
+
+ // No refcounting: the lifetime of 'p' is constrained
+ // to the lifetime of the file.
+ p->file = file;
/*
* Wrappers around seq_open(e.g. swaps_open) need to be
diff --git a/fs/splice.c b/fs/splice.c
index 9947b5c696649..dd9bf7e410d29 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -88,7 +88,7 @@ out_unlock:
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- page_cache_release(buf->page);
+ put_page(buf->page);
buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
@@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(splice_to_pipe);
void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
- page_cache_release(spd->pages[i]);
+ put_page(spd->pages[i]);
}
/*
@@ -328,9 +328,9 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
if (splice_grow_spd(pipe, &spd))
return -ENOMEM;
- index = *ppos >> PAGE_CACHE_SHIFT;
- loff = *ppos & ~PAGE_CACHE_MASK;
- req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
+ loff = *ppos & ~PAGE_MASK;
+ req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT;
nr_pages = min(req_pages, spd.nr_pages_max);
/*
@@ -365,7 +365,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(error)) {
- page_cache_release(page);
+ put_page(page);
if (error == -EEXIST)
continue;
break;
@@ -385,7 +385,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* Now loop over the map and see if we need to start IO on any
* pages, fill in the partial map, etc.
*/
- index = *ppos >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
nr_pages = spd.nr_pages;
spd.nr_pages = 0;
for (page_nr = 0; page_nr < nr_pages; page_nr++) {
@@ -397,7 +397,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
/*
* this_len is the max we'll use from this page
*/
- this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ this_len = min_t(unsigned long, len, PAGE_SIZE - loff);
page = spd.pages[page_nr];
if (PageReadahead(page))
@@ -426,7 +426,7 @@ retry_lookup:
error = -ENOMEM;
break;
}
- page_cache_release(spd.pages[page_nr]);
+ put_page(spd.pages[page_nr]);
spd.pages[page_nr] = page;
}
/*
@@ -456,7 +456,7 @@ fill_it:
* i_size must be checked after PageUptodate.
*/
isize = i_size_read(mapping->host);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (isize - 1) >> PAGE_SHIFT;
if (unlikely(!isize || index > end_index))
break;
@@ -470,7 +470,7 @@ fill_it:
/*
* max good bytes in this page
*/
- plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ plen = ((isize - 1) & ~PAGE_MASK) + 1;
if (plen <= loff)
break;
@@ -494,8 +494,8 @@ fill_it:
* we got, 'nr_pages' is how many pages are in the map.
*/
while (page_nr < nr_pages)
- page_cache_release(spd.pages[page_nr++]);
- in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ put_page(spd.pages[page_nr++]);
+ in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);
@@ -636,8 +636,8 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
goto shrink_ret;
}
- offset = *ppos & ~PAGE_CACHE_MASK;
- nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
+ nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
struct page *page;
@@ -647,7 +647,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
if (!page)
goto err;
- this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+ this_len = min_t(size_t, len, PAGE_SIZE - offset);
vec[i].iov_base = (void __user *) page_address(page);
vec[i].iov_len = this_len;
spd.pages[i] = page;
@@ -1143,6 +1143,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(ret < 0))
return ret;
+ if (unlikely(len > MAX_RW_COUNT))
+ len = MAX_RW_COUNT;
+
if (in->f_op->splice_read)
splice_read = in->f_op->splice_read;
else
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 0cea9b9236d07..ce62a380314f0 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -124,7 +124,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
goto block_release;
bytes += msblk->devblksize;
}
- ll_rw_block(READ, b, bh);
+ ll_rw_block(REQ_OP_READ, 0, b, bh);
} else {
/*
* Metadata block.
@@ -156,7 +156,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
goto block_release;
bytes += msblk->devblksize;
}
- ll_rw_block(READ, b - 1, bh + 1);
+ ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1);
}
for (i = 0; i < b; i++) {
@@ -181,11 +181,11 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
in = min(bytes, msblk->devblksize - offset);
bytes -= in;
while (in) {
- if (pg_offset == PAGE_CACHE_SIZE) {
+ if (pg_offset == PAGE_SIZE) {
data = squashfs_next_page(output);
pg_offset = 0;
}
- avail = min_t(int, in, PAGE_CACHE_SIZE -
+ avail = min_t(int, in, PAGE_SIZE -
pg_offset);
memcpy(data + pg_offset, bh[k]->b_data + offset,
avail);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 1cb70a0b21684..23813c078cc95 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -30,7 +30,7 @@
* access the metadata and fragment caches.
*
* To avoid out of memory and fragmentation issues with vmalloc the cache
- * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
+ * uses sequences of kmalloced PAGE_SIZE buffers.
*
* It should be noted that the cache is not used for file datablocks, these
* are decompressed and cached in the page-cache in the normal way. The
@@ -231,7 +231,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
/*
* Initialise cache allocating the specified number of entries, each of
* size block_size. To avoid vmalloc fragmentation issues each entry
- * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
+ * is allocated as a sequence of kmalloced PAGE_SIZE buffers.
*/
struct squashfs_cache *squashfs_cache_init(char *name, int entries,
int block_size)
@@ -255,7 +255,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
cache->unused = entries;
cache->entries = entries;
cache->block_size = block_size;
- cache->pages = block_size >> PAGE_CACHE_SHIFT;
+ cache->pages = block_size >> PAGE_SHIFT;
cache->pages = cache->pages ? cache->pages : 1;
cache->name = name;
cache->num_waiters = 0;
@@ -275,7 +275,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
}
for (j = 0; j < cache->pages; j++) {
- entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ entry->data[j] = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (entry->data[j] == NULL) {
ERROR("Failed to allocate %s buffer\n", name);
goto cleanup;
@@ -314,10 +314,10 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
return min(length, entry->length - offset);
while (offset < entry->length) {
- void *buff = entry->data[offset / PAGE_CACHE_SIZE]
- + (offset % PAGE_CACHE_SIZE);
+ void *buff = entry->data[offset / PAGE_SIZE]
+ + (offset % PAGE_SIZE);
int bytes = min_t(int, entry->length - offset,
- PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
+ PAGE_SIZE - (offset % PAGE_SIZE));
if (bytes >= remaining) {
memcpy(buffer, buff, remaining);
@@ -415,7 +415,7 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
*/
void *squashfs_read_table(struct super_block *sb, u64 block, int length)
{
- int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
int i, res;
void *table, *buffer, **data;
struct squashfs_page_actor *actor;
@@ -436,7 +436,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
goto failed2;
}
- for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
+ for (i = 0; i < pages; i++, buffer += PAGE_SIZE)
data[i] = buffer;
res = squashfs_read_data(sb, block, length |
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index e9034bf6e5ae2..d2bc13636f792 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -102,7 +102,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags)
* Read decompressor specific options from file system if present
*/
if (SQUASHFS_COMP_OPTS(flags)) {
- buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (buffer == NULL) {
comp_opts = ERR_PTR(-ENOMEM);
goto out;
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index d8c2d747be28d..a5845f94a2a17 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -231,6 +231,6 @@ failed_read:
const struct file_operations squashfs_dir_ops = {
.read = generic_read_dir,
- .iterate = squashfs_readdir,
- .llseek = default_llseek,
+ .iterate_shared = squashfs_readdir,
+ .llseek = generic_file_llseek,
};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index e5c9689062ba8..13d80947bf9e6 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -175,7 +175,7 @@ static long long read_indexes(struct super_block *sb, int n,
{
int err, i;
long long block = 0;
- __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ __le32 *blist = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (blist == NULL) {
ERROR("read_indexes: Failed to allocate block_list\n");
@@ -183,7 +183,7 @@ static long long read_indexes(struct super_block *sb, int n,
}
while (n) {
- int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
+ int blocks = min_t(int, n, PAGE_SIZE >> 2);
err = squashfs_read_metadata(sb, blist, start_block,
offset, blocks << 2);
@@ -377,19 +377,19 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
void *pageaddr;
- int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+ int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
int start_index = page->index & ~mask, end_index = start_index | mask;
/*
* Loop copying datablock into pages. As the datablock likely covers
- * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
+ * many PAGE_SIZE pages (default block size is 128 KiB) explicitly
* grab the pages from the page cache, except for the page that we've
* been called to fill.
*/
for (i = start_index; i <= end_index && bytes > 0; i++,
- bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+ bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
struct page *push_page;
- int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
+ int avail = buffer ? min_t(int, bytes, PAGE_SIZE) : 0;
TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
@@ -404,14 +404,14 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
pageaddr = kmap_atomic(push_page);
squashfs_copy_data(pageaddr, buffer, offset, avail);
- memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+ memset(pageaddr + avail, 0, PAGE_SIZE - avail);
kunmap_atomic(pageaddr);
flush_dcache_page(push_page);
SetPageUptodate(push_page);
skip_page:
unlock_page(push_page);
if (i != page->index)
- page_cache_release(push_page);
+ put_page(push_page);
}
}
@@ -454,7 +454,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+ int index = page->index >> (msblk->block_log - PAGE_SHIFT);
int file_end = i_size_read(inode) >> msblk->block_log;
int res;
void *pageaddr;
@@ -462,8 +462,8 @@ static int squashfs_readpage(struct file *file, struct page *page)
TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
page->index, squashfs_i(inode)->start);
- if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT))
+ if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT))
goto out;
if (index < file_end || squashfs_i(inode)->fragment_block ==
@@ -487,7 +487,7 @@ error_out:
SetPageError(page);
out:
pageaddr = kmap_atomic(page);
- memset(pageaddr, 0, PAGE_CACHE_SIZE);
+ memset(pageaddr, 0, PAGE_SIZE);
kunmap_atomic(pageaddr);
flush_dcache_page(page);
if (!PageError(page))
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 43e7a7eddac03..cb485d8e0e91b 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -30,8 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
struct inode *inode = target_page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
- int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+ int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+ int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
int start_index = target_page->index & ~mask;
int end_index = start_index | mask;
int i, n, pages, missing_pages, bytes, res = -ENOMEM;
@@ -68,7 +68,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
if (PageUptodate(page[i])) {
unlock_page(page[i]);
- page_cache_release(page[i]);
+ put_page(page[i]);
page[i] = NULL;
missing_pages++;
}
@@ -96,10 +96,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
goto mark_errored;
/* Last page may have trailing bytes not filled */
- bytes = res % PAGE_CACHE_SIZE;
+ bytes = res % PAGE_SIZE;
if (bytes) {
pageaddr = kmap_atomic(page[pages - 1]);
- memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+ memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
kunmap_atomic(pageaddr);
}
@@ -109,7 +109,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
SetPageUptodate(page[i]);
unlock_page(page[i]);
if (page[i] != target_page)
- page_cache_release(page[i]);
+ put_page(page[i]);
}
kfree(actor);
@@ -127,7 +127,7 @@ mark_errored:
flush_dcache_page(page[i]);
SetPageError(page[i]);
unlock_page(page[i]);
- page_cache_release(page[i]);
+ put_page(page[i]);
}
out:
@@ -153,21 +153,21 @@ static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
}
for (n = 0; n < pages && bytes > 0; n++,
- bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
- int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+ bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
+ int avail = min_t(int, bytes, PAGE_SIZE);
if (page[n] == NULL)
continue;
pageaddr = kmap_atomic(page[n]);
squashfs_copy_data(pageaddr, buffer, offset, avail);
- memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+ memset(pageaddr + avail, 0, PAGE_SIZE - avail);
kunmap_atomic(pageaddr);
flush_dcache_page(page[n]);
SetPageUptodate(page[n]);
unlock_page(page[n]);
if (page[n] != target_page)
- page_cache_release(page[n]);
+ put_page(page[n]);
}
out:
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index c31e2bc9c0815..ff4468bd18b02 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -117,13 +117,13 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
data = squashfs_first_page(output);
buff = stream->output;
while (data) {
- if (bytes <= PAGE_CACHE_SIZE) {
+ if (bytes <= PAGE_SIZE) {
memcpy(data, buff, bytes);
break;
}
- memcpy(data, buff, PAGE_CACHE_SIZE);
- buff += PAGE_CACHE_SIZE;
- bytes -= PAGE_CACHE_SIZE;
+ memcpy(data, buff, PAGE_SIZE);
+ buff += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
}
squashfs_finish_page(output);
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 244b9fbfff7b2..934c17e965908 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -102,13 +102,13 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
data = squashfs_first_page(output);
buff = stream->output;
while (data) {
- if (bytes <= PAGE_CACHE_SIZE) {
+ if (bytes <= PAGE_SIZE) {
memcpy(data, buff, bytes);
break;
} else {
- memcpy(data, buff, PAGE_CACHE_SIZE);
- buff += PAGE_CACHE_SIZE;
- bytes -= PAGE_CACHE_SIZE;
+ memcpy(data, buff, PAGE_SIZE);
+ buff += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
}
}
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 5a1c11f564415..9b7b1b6a78926 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -48,7 +48,7 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
if (actor == NULL)
return NULL;
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->length = length ? : pages * PAGE_SIZE;
actor->buffer = buffer;
actor->pages = pages;
actor->next_page = 0;
@@ -88,7 +88,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
if (actor == NULL)
return NULL;
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->length = length ? : pages * PAGE_SIZE;
actor->page = page;
actor->pages = pages;
actor->next_page = 0;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 26dd82008b82c..98537eab27e27 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -24,7 +24,7 @@ static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
if (actor == NULL)
return NULL;
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->length = length ? : pages * PAGE_SIZE;
actor->page = page;
actor->pages = pages;
actor->next_page = 0;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5e79bfa4f2607..cf01e15a7b16d 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -152,7 +152,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
* Check the system page size is not larger than the filesystem
* block size (by default 128K). This is currently not supported.
*/
- if (PAGE_CACHE_SIZE > msblk->block_size) {
+ if (PAGE_SIZE > msblk->block_size) {
ERROR("Page size > filesystem block size (%d). This is "
"currently not supported!\n", msblk->block_size);
goto failed_mount;
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index dbcc2f54bad46..d688ef42a6a1f 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -48,10 +48,10 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct squashfs_sb_info *msblk = sb->s_fs_info;
- int index = page->index << PAGE_CACHE_SHIFT;
+ int index = page->index << PAGE_SHIFT;
u64 block = squashfs_i(inode)->start;
int offset = squashfs_i(inode)->offset;
- int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
+ int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE);
int bytes, copied;
void *pageaddr;
struct squashfs_cache_entry *entry;
@@ -94,7 +94,7 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)
copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
length - bytes);
if (copied == length - bytes)
- memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
+ memset(pageaddr + length, 0, PAGE_SIZE - length);
else
block = entry->next_index;
kunmap_atomic(pageaddr);
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 1e9de96288d8f..1548b3784548d 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -214,10 +214,12 @@ failed:
static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
- struct dentry *d, const char *name,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name,
void *buffer, size_t size)
{
- return squashfs_xattr_get(d_inode(d), handler->flags, name,
+ return squashfs_xattr_get(inode, handler->flags, name,
buffer, size);
}
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index c609624e4b8a8..6bfaef73d0652 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -141,7 +141,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.in_pos = 0;
stream->buf.in_size = 0;
stream->buf.out_pos = 0;
- stream->buf.out_size = PAGE_CACHE_SIZE;
+ stream->buf.out_size = PAGE_SIZE;
stream->buf.out = squashfs_first_page(output);
do {
@@ -158,7 +158,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.out = squashfs_next_page(output);
if (stream->buf.out != NULL) {
stream->buf.out_pos = 0;
- total += PAGE_CACHE_SIZE;
+ total += PAGE_SIZE;
}
}
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 8727caba68822..2ec24d128bce0 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -69,7 +69,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
int zlib_err, zlib_init = 0, k = 0;
z_stream *stream = strm;
- stream->avail_out = PAGE_CACHE_SIZE;
+ stream->avail_out = PAGE_SIZE;
stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
@@ -85,7 +85,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->avail_out == 0) {
stream->next_out = squashfs_next_page(output);
if (stream->next_out != NULL)
- stream->avail_out = PAGE_CACHE_SIZE;
+ stream->avail_out = PAGE_SIZE;
}
if (!zlib_init) {
diff --git a/fs/super.c b/fs/super.c
index 74914b1bae70f..c2ff475c1711f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,6 +33,7 @@
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
+#include <linux/user_namespace.h>
#include "internal.h"
@@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s)
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
+ put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
kfree(s->s_options);
call_rcu(&s->rcu, destroy_super_rcu);
@@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s)
* alloc_super - create new superblock
* @type: filesystem type superblock should belong to
* @flags: the mount flags
+ * @user_ns: User namespace for the super_block
*
* Allocates and initializes a new &struct super_block. alloc_super()
* returns a pointer new superblock or %NULL if allocation had failed.
*/
-static struct super_block *alloc_super(struct file_system_type *type, int flags)
+static struct super_block *alloc_super(struct file_system_type *type, int flags,
+ struct user_namespace *user_ns)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
@@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
return NULL;
INIT_LIST_HEAD(&s->s_mounts);
+ s->s_user_ns = get_user_ns(user_ns);
if (security_sb_alloc(s))
goto fail;
@@ -201,11 +206,15 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
+ if (s->s_user_ns != &init_user_ns)
+ s->s_iflags |= SB_I_NODEV;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
mutex_init(&s->s_sync_lock);
INIT_LIST_HEAD(&s->s_inodes);
spin_lock_init(&s->s_inode_list_lock);
+ INIT_LIST_HEAD(&s->s_inodes_wb);
+ spin_lock_init(&s->s_inode_wblist_lock);
if (list_lru_init_memcg(&s->s_dentry_lru))
goto fail;
@@ -285,7 +294,7 @@ static void put_super(struct super_block *sb)
* deactivate_locked_super - drop an active reference to superblock
* @s: superblock to deactivate
*
- * Drops an active reference to superblock, converting it into a temprory
+ * Drops an active reference to superblock, converting it into a temporary
* one if there is no other active references left. In that case we
* tell fs driver to shut it down and drop the temporary reference we
* had just acquired.
@@ -443,29 +452,42 @@ void generic_shutdown_super(struct super_block *sb)
EXPORT_SYMBOL(generic_shutdown_super);
/**
- * sget - find or create a superblock
+ * sget_userns - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
* @set: setup callback
* @flags: mount flags
+ * @user_ns: User namespace for the super_block
* @data: argument to each of them
*/
-struct super_block *sget(struct file_system_type *type,
+struct super_block *sget_userns(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
- int flags,
+ int flags, struct user_namespace *user_ns,
void *data)
{
struct super_block *s = NULL;
struct super_block *old;
int err;
+ if (!(flags & MS_KERNMOUNT) &&
+ !(type->fs_flags & FS_USERNS_MOUNT) &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
retry:
spin_lock(&sb_lock);
if (test) {
hlist_for_each_entry(old, &type->fs_supers, s_instances) {
if (!test(old, data))
continue;
+ if (user_ns != old->s_user_ns) {
+ spin_unlock(&sb_lock);
+ if (s) {
+ up_write(&s->s_umount);
+ destroy_super(s);
+ }
+ return ERR_PTR(-EBUSY);
+ }
if (!grab_super(old))
goto retry;
if (s) {
@@ -478,7 +500,7 @@ retry:
}
if (!s) {
spin_unlock(&sb_lock);
- s = alloc_super(type, flags);
+ s = alloc_super(type, flags, user_ns);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
@@ -501,6 +523,31 @@ retry:
return s;
}
+EXPORT_SYMBOL(sget_userns);
+
+/**
+ * sget - find or create a superblock
+ * @type: filesystem type superblock should belong to
+ * @test: comparison callback
+ * @set: setup callback
+ * @flags: mount flags
+ * @data: argument to each of them
+ */
+struct super_block *sget(struct file_system_type *type,
+ int (*test)(struct super_block *,void *),
+ int (*set)(struct super_block *,void *),
+ int flags,
+ void *data)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ /* Ensure the requestor has permissions over the target filesystem */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return sget_userns(type, test, set, flags, user_ns, data);
+}
+
EXPORT_SYMBOL(sget);
void drop_super(struct super_block *sb)
@@ -918,12 +965,20 @@ static int ns_set_super(struct super_block *sb, void *data)
return set_anon_super(sb, NULL);
}
-struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
- void *data, int (*fill_super)(struct super_block *, void *, int))
+struct dentry *mount_ns(struct file_system_type *fs_type,
+ int flags, void *data, void *ns, struct user_namespace *user_ns,
+ int (*fill_super)(struct super_block *, void *, int))
{
struct super_block *sb;
- sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the namespace.
+ */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
+ user_ns, ns);
if (IS_ERR(sb))
return ERR_CAST(sb);
diff --git a/fs/sync.c b/fs/sync.c
index dd5d1711c7ac3..2a54c1f220359 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -302,7 +302,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
goto out;
if (sizeof(pgoff_t) == 4) {
- if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+ if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* The range starts outside a 32 bit machine's
* pagecache addressing capabilities. Let it "succeed"
@@ -310,7 +310,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
ret = 0;
goto out;
}
- if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+ if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* Out to EOF
*/
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f3db82071cfbd..20b8f82e115b6 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
if (IS_ERR(root) || !new_sb)
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
else if (new_sb)
- /* Userspace would break if executables appear on sysfs */
- root->d_sb->s_iflags |= SB_I_NOEXEC;
+ root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
return root;
}
@@ -59,7 +58,7 @@ static struct file_system_type sysfs_fs_type = {
.name = "sysfs",
.mount = sysfs_mount,
.kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
int __init sysfs_init(void)
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 63c1bcb224ee8..2661b77fc8a79 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -23,14 +23,14 @@ static int sysv_readdir(struct file *, struct dir_context *);
const struct file_operations sysv_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = sysv_readdir,
+ .iterate_shared = sysv_readdir,
.fsync = generic_file_fsync,
};
static inline void dir_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
@@ -73,8 +73,8 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
if (pos >= inode->i_size)
return 0;
- offset = pos & ~PAGE_CACHE_MASK;
- n = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_MASK;
+ n = pos >> PAGE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -85,7 +85,7 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
continue;
kaddr = (char *)page_address(page);
de = (struct sysv_dir_entry *)(kaddr+offset);
- limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+ limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE;
for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
char *name = de->name;
@@ -146,7 +146,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
if (!IS_ERR(page)) {
kaddr = (char*)page_address(page);
de = (struct sysv_dir_entry *) kaddr;
- kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+ kaddr += PAGE_SIZE - SYSV_DIRSIZE;
for ( ; (char *) de <= kaddr ; de++) {
if (!de->inode)
continue;
@@ -190,7 +190,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
goto out;
kaddr = (char*)page_address(page);
de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+ kaddr += PAGE_SIZE - SYSV_DIRSIZE;
while ((char *)de <= kaddr) {
if (!de->inode)
goto got_it;
@@ -261,7 +261,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir)
kmap(page);
base = (char*)page_address(page);
- memset(base, 0, PAGE_CACHE_SIZE);
+ memset(base, 0, PAGE_SIZE);
de = (struct sysv_dir_entry *) base;
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
@@ -273,7 +273,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir)
kunmap(page);
err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -296,7 +296,7 @@ int sysv_empty_dir(struct inode * inode)
kaddr = (char *)page_address(page);
de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE;
+ kaddr += PAGE_SIZE-SYSV_DIRSIZE;
for ( ;(char *)de <= kaddr; de++) {
if (!de->inode)
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e83ed0b4bf4..a42de45ce40db 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -33,7 +33,7 @@ static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
function. */
if (qstr->len > SYSV_NAMELEN) {
qstr->len = SYSV_NAMELEN;
- qstr->hash = full_name_hash(qstr->name, qstr->len);
+ qstr->hash = full_name_hash(dentry, qstr->name, qstr->len);
}
return 0;
}
@@ -264,11 +264,11 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 053818dd6c18b..9ae4abb4110b8 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -390,6 +390,11 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
clockid != CLOCK_BOOTTIME_ALARM))
return -EINVAL;
+ if (!capable(CAP_WAKE_ALARM) &&
+ (clockid == CLOCK_REALTIME_ALARM ||
+ clockid == CLOCK_BOOTTIME_ALARM))
+ return -EPERM;
+
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
@@ -433,6 +438,11 @@ static int do_timerfd_settime(int ufd, int flags,
return ret;
ctx = f.file->private_data;
+ if (!capable(CAP_WAKE_ALARM) && isalarm(ctx)) {
+ fdput(f);
+ return -EPERM;
+ }
+
timerfd_setup_cancel(ctx, flags);
/*
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 4a0e48f921048..ad40b64c5e2f4 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -541,9 +541,6 @@ void tracefs_remove(struct dentry *dentry)
return;
parent = dentry->d_parent;
- if (!parent || !parent->d_inode)
- return;
-
inode_lock(parent->d_inode);
ret = __tracefs_remove(dentry, parent);
inode_unlock(parent->d_inode);
@@ -566,10 +563,6 @@ void tracefs_remove_recursive(struct dentry *dentry)
if (IS_ERR_OR_NULL(dentry))
return;
- parent = dentry->d_parent;
- if (!parent || !parent->d_inode)
- return;
-
parent = dentry;
down:
inode_lock(parent->d_inode);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 595ca0debe117..69e287e207327 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -260,7 +260,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
pr_err("\txattr_names %u\n", ui->xattr_names);
pr_err("\tdirty %u\n", ui->dirty);
pr_err("\txattr %u\n", ui->xattr);
- pr_err("\tbulk_read %u\n", ui->xattr);
+ pr_err("\tbulk_read %u\n", ui->bulk_read);
pr_err("\tsynced_i_size %llu\n",
(unsigned long long)ui->synced_i_size);
pr_err("\tui_size %llu\n",
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 795992a8321e9..4b86d3a738e18 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1182,10 +1182,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
.rename = ubifs_rename,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
- .setxattr = ubifs_setxattr,
- .getxattr = ubifs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ubifs_listxattr,
- .removexattr = ubifs_removexattr,
+ .removexattr = generic_removexattr,
#ifdef CONFIG_UBIFS_ATIME_SUPPORT
.update_time = ubifs_update_time,
#endif
@@ -1195,7 +1195,7 @@ const struct file_operations ubifs_dir_operations = {
.llseek = generic_file_llseek,
.release = ubifs_dir_release,
.read = generic_read_dir,
- .iterate = ubifs_readdir,
+ .iterate_shared = ubifs_readdir,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 065c88f8e4b8c..7bbf420d12898 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
#include "ubifs.h"
#include <linux/mount.h>
#include <linux/slab.h>
+#include <linux/migrate.h>
static int read_block(struct inode *inode, void *addr, unsigned int block,
struct ubifs_data_node *dn)
@@ -121,7 +122,7 @@ static int do_readpage(struct page *page)
if (block >= beyond) {
/* Reading beyond inode */
SetPageChecked(page);
- memset(addr, 0, PAGE_CACHE_SIZE);
+ memset(addr, 0, PAGE_SIZE);
goto out;
}
@@ -223,7 +224,7 @@ static int write_begin_slow(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct ubifs_budget_req req = { .new_page = 1 };
int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
struct page *page;
@@ -254,13 +255,13 @@ static int write_begin_slow(struct address_space *mapping,
}
if (!PageUptodate(page)) {
- if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+ if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE)
SetPageChecked(page);
else {
err = do_readpage(page);
if (err) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ubifs_release_budget(c, &req);
return err;
}
@@ -428,7 +429,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
int skipped_read = 0;
struct page *page;
@@ -446,7 +447,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
if (!PageUptodate(page)) {
/* The page is not loaded from the flash */
- if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
+ if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) {
/*
* We change whole page so no need to load it. But we
* do not know whether this page exists on the media or
@@ -462,7 +463,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
err = do_readpage(page);
if (err) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
}
@@ -494,7 +495,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
mutex_unlock(&ui->ui_mutex);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return write_begin_slow(mapping, pos, len, pagep, flags);
}
@@ -549,12 +550,12 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
inode->i_ino, pos, page->index, len, copied, inode->i_size);
- if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
+ if (unlikely(copied < len && len == PAGE_SIZE)) {
/*
* VFS copied less data to the page that it intended and
* declared in its '->write_begin()' call via the @len
* argument. If the page was not up-to-date, and @len was
- * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did
+ * @PAGE_SIZE, the 'ubifs_write_begin()' function did
* not load it from the media (for optimization reasons). This
* means that part of the page contains garbage. So read the
* page now.
@@ -593,7 +594,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -621,10 +622,10 @@ static int populate_page(struct ubifs_info *c, struct page *page,
addr = zaddr = kmap(page);
- end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (i_size - 1) >> PAGE_SHIFT;
if (!i_size || page->index > end_index) {
hole = 1;
- memset(addr, 0, PAGE_CACHE_SIZE);
+ memset(addr, 0, PAGE_SIZE);
goto out_hole;
}
@@ -673,7 +674,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
}
if (end_index == page->index) {
- int len = i_size & (PAGE_CACHE_SIZE - 1);
+ int len = i_size & (PAGE_SIZE - 1);
if (len && len < read)
memset(zaddr + len, 0, read - len);
@@ -773,7 +774,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
isize = i_size_read(inode);
if (isize == 0)
goto out_free;
- end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ end_index = ((isize - 1) >> PAGE_SHIFT);
for (page_idx = 1; page_idx < page_cnt; page_idx++) {
pgoff_t page_offset = offset + page_idx;
@@ -788,7 +789,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
if (!PageUptodate(page))
err = populate_page(c, page, bu, &n);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (err)
break;
}
@@ -905,7 +906,7 @@ static int do_writepage(struct page *page, int len)
#ifdef UBIFS_DEBUG
struct ubifs_inode *ui = ubifs_inode(inode);
spin_lock(&ui->ui_lock);
- ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
+ ubifs_assert(page->index <= ui->synced_i_size >> PAGE_SHIFT);
spin_unlock(&ui->ui_lock);
#endif
@@ -1001,8 +1002,8 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = page->mapping->host;
struct ubifs_inode *ui = ubifs_inode(inode);
loff_t i_size = i_size_read(inode), synced_i_size;
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- int err, len = i_size & (PAGE_CACHE_SIZE - 1);
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
+ int err, len = i_size & (PAGE_SIZE - 1);
void *kaddr;
dbg_gen("ino %lu, pg %lu, pg flags %#lx",
@@ -1021,7 +1022,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
/* Is the page fully inside @i_size? */
if (page->index < end_index) {
- if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
+ if (page->index >= synced_i_size >> PAGE_SHIFT) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out_unlock;
@@ -1034,7 +1035,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
* with this.
*/
}
- return do_writepage(page, PAGE_CACHE_SIZE);
+ return do_writepage(page, PAGE_SIZE);
}
/*
@@ -1045,7 +1046,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
* writes to that region are not written out to the file."
*/
kaddr = kmap_atomic(page);
- memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
+ memset(kaddr + len, 0, PAGE_SIZE - len);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -1138,7 +1139,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
truncate_setsize(inode, new_size);
if (offset) {
- pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
+ pgoff_t index = new_size >> PAGE_SHIFT;
struct page *page;
page = find_lock_page(inode->i_mapping, index);
@@ -1157,9 +1158,9 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
clear_page_dirty_for_io(page);
if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
offset = new_size &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
err = do_writepage(page, offset);
- page_cache_release(page);
+ put_page(page);
if (err)
goto out_budg;
/*
@@ -1173,7 +1174,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
* having to read it.
*/
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
}
@@ -1285,7 +1286,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
struct ubifs_info *c = inode->i_sb->s_fs_info;
ubifs_assert(PagePrivate(page));
- if (offset || length < PAGE_CACHE_SIZE)
+ if (offset || length < PAGE_SIZE)
/* Partial page remains dirty */
return;
@@ -1452,6 +1453,26 @@ static int ubifs_set_page_dirty(struct page *page)
return ret;
}
+#ifdef CONFIG_MIGRATION
+static int ubifs_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ int rc;
+
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ SetPagePrivate(newpage);
+ }
+
+ migrate_page_copy(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
{
/*
@@ -1591,16 +1612,19 @@ const struct address_space_operations ubifs_file_address_operations = {
.write_end = ubifs_write_end,
.invalidatepage = ubifs_invalidatepage,
.set_page_dirty = ubifs_set_page_dirty,
+#ifdef CONFIG_MIGRATION
+ .migratepage = ubifs_migrate_page,
+#endif
.releasepage = ubifs_releasepage,
};
const struct inode_operations ubifs_file_inode_operations = {
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
- .setxattr = ubifs_setxattr,
- .getxattr = ubifs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ubifs_listxattr,
- .removexattr = ubifs_removexattr,
+ .removexattr = generic_removexattr,
#ifdef CONFIG_UBIFS_ATIME_SUPPORT
.update_time = ubifs_update_time,
#endif
@@ -1611,10 +1635,10 @@ const struct inode_operations ubifs_symlink_inode_operations = {
.get_link = simple_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
- .setxattr = ubifs_setxattr,
- .getxattr = ubifs_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ubifs_listxattr,
- .removexattr = ubifs_removexattr,
+ .removexattr = generic_removexattr,
#ifdef CONFIG_UBIFS_ATIME_SUPPORT
.update_time = ubifs_update_time,
#endif
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9718da86ad01a..821b34816976a 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,10 +100,6 @@ static int switch_gc_head(struct ubifs_info *c)
if (err)
return err;
- err = ubifs_wbuf_sync_nolock(wbuf);
- if (err)
- return err;
-
err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
if (err)
return err;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index f4fbc7b6b7947..3cbb904a6d7d9 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,8 +28,8 @@
#include "ubifs.h"
#include <linux/slab.h>
-#include <linux/random.h>
#include <linux/math64.h>
+#include <linux/uuid.h>
/*
* Default journal size in logical eraseblocks as a percent of total
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a233ba913be4f..4ec051089186e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -520,19 +520,19 @@ static int init_constants_early(struct ubifs_info *c)
c->max_write_shift = fls(c->max_write_size) - 1;
if (c->leb_size < UBIFS_MIN_LEB_SZ) {
- ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes",
- c->leb_size, UBIFS_MIN_LEB_SZ);
+ ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes",
+ c->leb_size, UBIFS_MIN_LEB_SZ);
return -EINVAL;
}
if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
- ubifs_err(c, "too few LEBs (%d), min. is %d",
- c->leb_cnt, UBIFS_MIN_LEB_CNT);
+ ubifs_errc(c, "too few LEBs (%d), min. is %d",
+ c->leb_cnt, UBIFS_MIN_LEB_CNT);
return -EINVAL;
}
if (!is_power_of_2(c->min_io_size)) {
- ubifs_err(c, "bad min. I/O size %d", c->min_io_size);
+ ubifs_errc(c, "bad min. I/O size %d", c->min_io_size);
return -EINVAL;
}
@@ -543,8 +543,8 @@ static int init_constants_early(struct ubifs_info *c)
if (c->max_write_size < c->min_io_size ||
c->max_write_size % c->min_io_size ||
!is_power_of_2(c->max_write_size)) {
- ubifs_err(c, "bad write buffer size %d for %d min. I/O unit",
- c->max_write_size, c->min_io_size);
+ ubifs_errc(c, "bad write buffer size %d for %d min. I/O unit",
+ c->max_write_size, c->min_io_size);
return -EINVAL;
}
@@ -2040,6 +2040,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (c->max_inode_sz > MAX_LFS_FILESIZE)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
+ sb->s_xattr = ubifs_xattr_handlers;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
@@ -2107,8 +2108,9 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
*/
ubi = open_ubi(name, UBI_READONLY);
if (IS_ERR(ubi)) {
- pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
- current->pid, name, (int)PTR_ERR(ubi));
+ if (!(flags & MS_SILENT))
+ pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
+ current->pid, name, (int)PTR_ERR(ubi));
return ERR_CAST(ubi);
}
@@ -2237,12 +2239,12 @@ static int __init ubifs_init(void)
BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
/*
- * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
+ * We require that PAGE_SIZE is greater-than-or-equal-to
* UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
*/
- if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
+ if (PAGE_SIZE < UBIFS_BLOCK_SIZE) {
pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
- current->pid, (unsigned int)PAGE_CACHE_SIZE);
+ current->pid, (unsigned int)PAGE_SIZE);
return -EINVAL;
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c2a57e193a81c..4617d459022a5 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -37,6 +37,7 @@
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/security.h>
+#include <linux/xattr.h>
#include "ubifs-media.h"
/* Version of this UBIFS implementation */
@@ -46,8 +47,8 @@
#define UBIFS_SUPER_MAGIC 0x24051905
/* Number of UBIFS blocks per VFS page */
-#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
-#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
+#define UBIFS_BLOCKS_PER_PAGE (PAGE_SIZE / UBIFS_BLOCK_SIZE)
+#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_SHIFT - UBIFS_BLOCK_SHIFT)
/* "File system end of life" sequence number watermark */
#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
@@ -1732,12 +1733,8 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
/* xattr.c */
-int ubifs_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags);
-ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
- size_t size);
+extern const struct xattr_handler *ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-int ubifs_removexattr(struct dentry *dentry, const char *name);
int ubifs_init_security(struct inode *dentry, struct inode *inode,
const struct qstr *qstr);
@@ -1786,8 +1783,8 @@ void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
__printf(2, 3)
void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
/*
- * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
- * object as an argument.
+ * A conditional variant of 'ubifs_err()' which doesn't output anything
+ * if probing (ie. MS_SILENT set).
*/
#define ubifs_errc(c, fmt, ...) \
do { \
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index b043e044121d1..e237811f09ce5 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -249,42 +249,6 @@ out_free:
return err;
}
-/**
- * check_namespace - check extended attribute name-space.
- * @nm: extended attribute name
- *
- * This function makes sure the extended attribute name belongs to one of the
- * supported extended attribute name-spaces. Returns name-space index in case
- * of success and a negative error code in case of failure.
- */
-static int check_namespace(const struct qstr *nm)
-{
- int type;
-
- if (nm->len > UBIFS_MAX_NLEN)
- return -ENAMETOOLONG;
-
- if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
- XATTR_TRUSTED_PREFIX_LEN)) {
- if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0')
- return -EINVAL;
- type = TRUSTED_XATTR;
- } else if (!strncmp(nm->name, XATTR_USER_PREFIX,
- XATTR_USER_PREFIX_LEN)) {
- if (nm->name[XATTR_USER_PREFIX_LEN] == '\0')
- return -EINVAL;
- type = USER_XATTR;
- } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN)) {
- if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0')
- return -EINVAL;
- type = SECURITY_XATTR;
- } else
- return -EOPNOTSUPP;
-
- return type;
-}
-
static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
{
struct inode *inode;
@@ -302,24 +266,23 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
return ERR_PTR(-EINVAL);
}
-static int setxattr(struct inode *host, const char *name, const void *value,
- size_t size, int flags)
+static int __ubifs_setxattr(struct inode *host, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_dent_node *xent;
union ubifs_key key;
- int err, type;
+ int err;
ubifs_assert(inode_is_locked(host));
if (size > UBIFS_MAX_INO_DATA)
return -ERANGE;
- type = check_namespace(&nm);
- if (type < 0)
- return type;
+ if (nm.len > UBIFS_MAX_NLEN)
+ return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
if (!xent)
@@ -363,19 +326,10 @@ out_free:
return err;
}
-int ubifs_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static ssize_t __ubifs_getxattr(struct inode *host, const char *name,
+ void *buf, size_t size)
{
- dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
- name, d_inode(dentry)->i_ino, dentry, size);
-
- return setxattr(d_inode(dentry), name, value, size, flags);
-}
-
-ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
- size_t size)
-{
- struct inode *inode, *host = d_inode(dentry);
+ struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_inode *ui;
@@ -383,12 +337,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
union ubifs_key key;
int err;
- dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
- host->i_ino, dentry, size);
-
- err = check_namespace(&nm);
- if (err < 0)
- return err;
+ if (nm.len > UBIFS_MAX_NLEN)
+ return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
if (!xent)
@@ -460,8 +410,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
lowest_xent_key(c, &key, host->i_ino);
while (1) {
- int type;
-
xent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(xent)) {
err = PTR_ERR(xent);
@@ -471,14 +419,10 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
nm.name = xent->name;
nm.len = le16_to_cpu(xent->nlen);
- type = check_namespace(&nm);
- if (unlikely(type < 0)) {
- err = type;
- break;
- }
-
/* Show trusted namespace only for "power" users */
- if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) {
+ if (strncmp(xent->name, XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN) ||
+ capable(CAP_SYS_ADMIN)) {
memcpy(buffer + written, nm.name, nm.len + 1);
written += nm.len + 1;
}
@@ -538,22 +482,19 @@ out_cancel:
return err;
}
-int ubifs_removexattr(struct dentry *dentry, const char *name)
+static int __ubifs_removexattr(struct inode *host, const char *name)
{
- struct inode *inode, *host = d_inode(dentry);
+ struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
- dbg_gen("xattr '%s', ino %lu ('%pd')", name,
- host->i_ino, dentry);
ubifs_assert(inode_is_locked(host));
- err = check_namespace(&nm);
- if (err < 0)
- return err;
+ if (nm.len > UBIFS_MAX_NLEN)
+ return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
if (!xent)
@@ -603,7 +544,7 @@ static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
- err = setxattr(inode, name, xattr->value, xattr->value_len, 0);
+ err = __ubifs_setxattr(inode, name, xattr->value, xattr->value_len, 0);
kfree(name);
if (err < 0)
break;
@@ -626,3 +567,52 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
}
return err;
}
+
+static int ubifs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
+ inode->i_ino, dentry, size);
+
+ return __ubifs_getxattr(inode, name, buffer, size);
+}
+
+static int ubifs_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
+ name, inode->i_ino, dentry, size);
+
+ if (value)
+ return __ubifs_setxattr(inode, name, value, size, flags);
+ else
+ return __ubifs_removexattr(inode, name);
+}
+
+static const struct xattr_handler ubifs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = ubifs_xattr_get,
+ .set = ubifs_xattr_set,
+};
+
+static const struct xattr_handler ubifs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = ubifs_xattr_get,
+ .set = ubifs_xattr_set,
+};
+
+static const struct xattr_handler ubifs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = ubifs_xattr_get,
+ .set = ubifs_xattr_set,
+};
+
+const struct xattr_handler *ubifs_xattr_handlers[] = {
+ &ubifs_user_xattr_handler,
+ &ubifs_trusted_xattr_handler,
+ &ubifs_security_xattr_handler,
+ NULL
+};
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index b51b371b874a0..aaec13c952531 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -113,7 +113,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
brelse(tmp);
}
if (num) {
- ll_rw_block(READA, num, bha);
+ ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
@@ -202,7 +202,7 @@ out:
const struct file_operations udf_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = udf_readdir,
+ .iterate_shared = udf_readdir,
.unlocked_ioctl = udf_ioctl,
.fsync = generic_file_fsync,
};
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index c763fda257bf3..988d5352bdb86 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -87,7 +87,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
brelse(tmp);
}
if (num) {
- ll_rw_block(READA, num, bha);
+ ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1af98963d860f..6325706173275 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -46,7 +46,7 @@ static void __udf_adinicb_readpage(struct page *page)
kaddr = kmap(page);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
- memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
+ memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size);
flush_dcache_page(page);
SetPageUptodate(page);
kunmap(page);
@@ -87,20 +87,19 @@ static int udf_adinicb_write_begin(struct file *file,
{
struct page *page;
- if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
+ if (WARN_ON_ONCE(pos >= PAGE_SIZE))
return -EIO;
page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page)
return -ENOMEM;
*pagep = page;
- if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
+ if (!PageUptodate(page) && len != PAGE_SIZE)
__udf_adinicb_readpage(page);
return 0;
}
-static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
/* Fallback to buffered I/O. */
return 0;
@@ -153,9 +152,7 @@ out:
if (retval > 0) {
mark_inode_dirty(inode);
- err = generic_write_sync(file, iocb->ki_pos - retval, retval);
- if (err < 0)
- retval = err;
+ retval = generic_write_sync(iocb, retval);
}
return retval;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 166d3ed32c39a..55aa587bbc385 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -214,8 +214,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -223,9 +222,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block);
if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE))
- udf_write_failed(mapping, offset + count);
+ udf_write_failed(mapping, iocb->ki_pos + count);
return ret;
}
@@ -287,7 +286,7 @@ int udf_expand_file_adinicb(struct inode *inode)
if (!PageUptodate(page)) {
kaddr = kmap(page);
memset(kaddr + iinfo->i_lenAlloc, 0x00,
- PAGE_CACHE_SIZE - iinfo->i_lenAlloc);
+ PAGE_SIZE - iinfo->i_lenAlloc);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
iinfo->i_lenAlloc);
flush_dcache_page(page);
@@ -319,7 +318,7 @@ int udf_expand_file_adinicb(struct inode *inode)
inode->i_data.a_ops = &udf_adinicb_aops;
up_write(&iinfo->i_data_sem);
}
- page_cache_release(page);
+ put_page(page);
mark_inode_dirty(inode);
return err;
@@ -1200,7 +1199,7 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index a2ba11eca9955..c3e5c96793715 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1250,7 +1250,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
brelse(fibh.sbh);
tloc = lelb_to_cpu(cfi.icb.extLocation);
- inode = udf_iget(d_inode(child)->i_sb, &tloc);
+ inode = udf_iget(child->d_sb, &tloc);
if (IS_ERR(inode))
return ERR_CAST(inode);
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 5f861ed287c3f..888c364b2fe95 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -295,7 +295,8 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
map = &UDF_SB(sb)->s_partmaps[partition];
/* map to sparable/physical partition desc */
phyblock = udf_get_pblock(sb, eloc.logicalBlockNum,
- map->s_partition_num, ext_offset + offset);
+ map->s_type_specific.s_metadata.s_phys_partition_ref,
+ ext_offset + offset);
}
brelse(epos.bh);
@@ -317,14 +318,18 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
mdata = &map->s_type_specific.s_metadata;
inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe;
- /* We shouldn't mount such media... */
- BUG_ON(!inode);
+ if (!inode)
+ return 0xFFFFFFFF;
+
retblk = udf_try_read_meta(inode, block, partition, offset);
if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) {
udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n");
if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) {
mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
- mdata->s_mirror_file_loc, map->s_partition_num);
+ mdata->s_mirror_file_loc,
+ mdata->s_phys_partition_ref);
+ if (IS_ERR(mdata->s_mirror_fe))
+ mdata->s_mirror_fe = NULL;
mdata->s_flags |= MF_MIRROR_FE_LOADED;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index fa92fe839fda2..4942549e7dc8b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -78,6 +78,15 @@
#define VSD_FIRST_SECTOR_OFFSET 32768
#define VSD_MAX_SECTOR_OFFSET 0x800000
+/*
+ * Maximum number of Terminating Descriptor / Logical Volume Integrity
+ * Descriptor redirections. The chosen numbers are arbitrary - just that we
+ * hopefully don't limit any real use of rewritten inode on write-once media
+ * but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_TD_NESTING 64
+#define UDF_MAX_LVID_NESTING 1000
+
enum { UDF_MAX_LINKS = 0xffff };
/* These are the "meat" - everything else is stuffing */
@@ -919,14 +928,14 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
#endif
}
- ret = udf_CS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
+ ret = udf_dstrCS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
if (ret < 0)
goto out_bh;
strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
- ret = udf_CS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
+ ret = udf_dstrCS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
if (ret < 0)
goto out_bh;
@@ -942,13 +951,13 @@ out2:
}
struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
- u32 meta_file_loc, u32 partition_num)
+ u32 meta_file_loc, u32 partition_ref)
{
struct kernel_lb_addr addr;
struct inode *metadata_fe;
addr.logicalBlockNum = meta_file_loc;
- addr.partitionReferenceNum = partition_num;
+ addr.partitionReferenceNum = partition_ref;
metadata_fe = udf_iget_special(sb, &addr);
@@ -965,7 +974,8 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
return metadata_fe;
}
-static int udf_load_metadata_files(struct super_block *sb, int partition)
+static int udf_load_metadata_files(struct super_block *sb, int partition,
+ int type1_index)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
@@ -975,20 +985,21 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
map = &sbi->s_partmaps[partition];
mdata = &map->s_type_specific.s_metadata;
+ mdata->s_phys_partition_ref = type1_index;
/* metadata address */
udf_debug("Metadata file location: block = %d part = %d\n",
- mdata->s_meta_file_loc, map->s_partition_num);
+ mdata->s_meta_file_loc, mdata->s_phys_partition_ref);
fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
- map->s_partition_num);
+ mdata->s_phys_partition_ref);
if (IS_ERR(fe)) {
/* mirror file entry */
udf_debug("Mirror metadata file location: block = %d part = %d\n",
- mdata->s_mirror_file_loc, map->s_partition_num);
+ mdata->s_mirror_file_loc, mdata->s_phys_partition_ref);
fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
- map->s_partition_num);
+ mdata->s_phys_partition_ref);
if (IS_ERR(fe)) {
udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
@@ -1006,7 +1017,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
*/
if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) {
addr.logicalBlockNum = mdata->s_bitmap_file_loc;
- addr.partitionReferenceNum = map->s_partition_num;
+ addr.partitionReferenceNum = mdata->s_phys_partition_ref;
udf_debug("Bitmap file location: block = %d part = %d\n",
addr.logicalBlockNum, addr.partitionReferenceNum);
@@ -1274,7 +1285,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
p = (struct partitionDesc *)bh->b_data;
partitionNumber = le16_to_cpu(p->partitionNumber);
- /* First scan for TYPE1, SPARABLE and METADATA partitions */
+ /* First scan for TYPE1 and SPARABLE partitions */
for (i = 0; i < sbi->s_partitions; i++) {
map = &sbi->s_partmaps[i];
udf_debug("Searching map: (%d == %d)\n",
@@ -1324,7 +1335,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
goto out_bh;
if (map->s_partition_type == UDF_METADATA_MAP25) {
- ret = udf_load_metadata_files(sb, i);
+ ret = udf_load_metadata_files(sb, i, type1_idx);
if (ret < 0) {
udf_err(sb, "error loading MetaData partition map %d\n",
i);
@@ -1541,42 +1552,52 @@ out_bh:
}
/*
- * udf_load_logicalvolint
- *
+ * Find the prevailing Logical Volume Integrity Descriptor.
*/
static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh, *final_bh;
uint16_t ident;
struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDesc *lvid;
+ int indirections = 0;
+
+ while (++indirections <= UDF_MAX_LVID_NESTING) {
+ final_bh = NULL;
+ while (loc.extLength > 0 &&
+ (bh = udf_read_tagged(sb, loc.extLocation,
+ loc.extLocation, &ident))) {
+ if (ident != TAG_IDENT_LVID) {
+ brelse(bh);
+ break;
+ }
+
+ brelse(final_bh);
+ final_bh = bh;
- while (loc.extLength > 0 &&
- (bh = udf_read_tagged(sb, loc.extLocation,
- loc.extLocation, &ident)) &&
- ident == TAG_IDENT_LVID) {
- sbi->s_lvid_bh = bh;
- lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+ loc.extLength -= sb->s_blocksize;
+ loc.extLocation++;
+ }
- if (lvid->nextIntegrityExt.extLength)
- udf_load_logicalvolint(sb,
- leea_to_cpu(lvid->nextIntegrityExt));
+ if (!final_bh)
+ return;
- if (sbi->s_lvid_bh != bh)
- brelse(bh);
- loc.extLength -= sb->s_blocksize;
- loc.extLocation++;
+ brelse(sbi->s_lvid_bh);
+ sbi->s_lvid_bh = final_bh;
+
+ lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
+ if (lvid->nextIntegrityExt.extLength == 0)
+ return;
+
+ loc = leea_to_cpu(lvid->nextIntegrityExt);
}
- if (sbi->s_lvid_bh != bh)
- brelse(bh);
+
+ udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
+ UDF_MAX_LVID_NESTING);
+ brelse(sbi->s_lvid_bh);
+ sbi->s_lvid_bh = NULL;
}
-/*
- * Maximum number of Terminating Descriptor redirections. The chosen number is
- * arbitrary - just that we hopefully don't limit any real use of rewritten
- * inode on write-once media but avoid looping for too long on corrupted media.
- */
-#define UDF_MAX_TD_NESTING 64
/*
* Process a main/reserve volume descriptor sequence.
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1f32c7bd9f57f..c13875d669c0f 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -3,9 +3,7 @@
#include <linux/mutex.h>
#include <linux/bitops.h>
-
-/* Since UDF 2.01 is ISO 13346 based... */
-#define UDF_SUPER_MAGIC 0x15013346
+#include <linux/magic.h>
#define UDF_MAX_READ_VERSION 0x0250
#define UDF_MAX_WRITE_VERSION 0x0201
@@ -63,6 +61,11 @@ struct udf_meta_data {
__u32 s_bitmap_file_loc;
__u32 s_alloc_unit_size;
__u16 s_align_unit_size;
+ /*
+ * Partition Reference Number of the associated physical / sparable
+ * partition
+ */
+ __u16 s_phys_partition_ref;
int s_flags;
struct inode *s_metadata_fe;
struct inode *s_mirror_fe;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 972b70625614f..263829ef18736 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -212,7 +212,7 @@ extern int udf_get_filename(struct super_block *, const uint8_t *, int,
uint8_t *, int);
extern int udf_put_filename(struct super_block *, const uint8_t *, int,
uint8_t *, int);
-extern int udf_CS0toUTF8(uint8_t *, int, const uint8_t *, int);
+extern int udf_dstrCS0toUTF8(uint8_t *, int, const uint8_t *, int);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 3ff42f4437f3e..695389a4fc239 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -335,9 +335,21 @@ try_again:
return u_len;
}
-int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len)
+int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
+ const uint8_t *ocu_i, int i_len)
{
- return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len,
+ int s_len = 0;
+
+ if (i_len > 0) {
+ s_len = ocu_i[i_len - 1];
+ if (s_len >= i_len) {
+ pr_err("incorrect dstring lengths (%d/%d)\n",
+ s_len, i_len);
+ return -EINVAL;
+ }
+ }
+
+ return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len,
udf_uni2char_utf8, 0);
}
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index dc5fae601c24b..67e085d591d83 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -237,7 +237,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
sector_t newb, struct page *locked_page)
{
const unsigned blks_per_page =
- 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ 1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
struct address_space * const mapping = inode->i_mapping;
pgoff_t index, cur_index, last_index;
@@ -255,9 +255,9 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
cur_index = locked_page->index;
end = count + beg;
- last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last_index = end >> (PAGE_SHIFT - inode->i_blkbits);
for (i = beg; i < end; i = (i | mask) + 1) {
- index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ index = i >> (PAGE_SHIFT - inode->i_blkbits);
if (likely(cur_index != index)) {
page = ufs_get_locked_page(mapping, index);
@@ -292,7 +292,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, oldb + pos);
if (!buffer_uptodate(bh)) {
- ll_rw_block(READ, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ufs_error(inode->i_sb, __func__,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 74f2e80288bfa..fa3bda1a860fe 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -62,7 +62,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
static inline void ufs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
@@ -105,19 +105,19 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
}
-static void ufs_check_page(struct page *page)
+static bool ufs_check_page(struct page *page)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
char *kaddr = page_address(page);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1;
struct ufs_dir_entry *p;
char *error;
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & chunk_mask)
goto Ebadsize;
if (!limit)
@@ -143,7 +143,7 @@ static void ufs_check_page(struct page *page)
goto Eend;
out:
SetPageChecked(page);
- return;
+ return true;
/* Too bad, we had an error */
@@ -170,7 +170,7 @@ Einumber:
bad_entry:
ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
"offset=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
rec_len, ufs_get_de_namlen(sb, p));
goto fail;
Eend:
@@ -178,10 +178,10 @@ Eend:
ufs_error(sb, __func__,
"entry in directory #%lu spans the page boundary"
"offset=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+ dir->i_ino, (page->index<<PAGE_SHIFT)+offs);
fail:
- SetPageChecked(page);
SetPageError(page);
+ return false;
}
static struct page *ufs_get_page(struct inode *dir, unsigned long n)
@@ -190,10 +190,10 @@ static struct page *ufs_get_page(struct inode *dir, unsigned long n)
struct page *page = read_mapping_page(mapping, n, NULL);
if (!IS_ERR(page)) {
kmap(page);
- if (!PageChecked(page))
- ufs_check_page(page);
- if (PageError(page))
- goto fail;
+ if (unlikely(!PageChecked(page))) {
+ if (PageError(page) || !ufs_check_page(page))
+ goto fail;
+ }
}
return page;
@@ -211,9 +211,9 @@ ufs_last_byte(struct inode *inode, unsigned long page_nr)
{
unsigned last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
@@ -279,12 +279,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
de = (struct ufs_dir_entry *) kaddr;
kaddr += ufs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
- if (de->d_reclen == 0) {
- ufs_error(dir->i_sb, __func__,
- "zero-length directory entry");
- ufs_put_page(page);
- goto out;
- }
if (ufs_match(sb, namelen, name, de))
goto found;
de = ufs_next_entry(sb, de);
@@ -341,7 +335,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + ufs_last_byte(dir, n);
de = (struct ufs_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -414,11 +408,8 @@ ufs_validate_entry(struct super_block *sb, char *base,
{
struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
- while ((char*)p < (char*)de) {
- if (p->d_reclen == 0)
- break;
+ while ((char*)p < (char*)de)
p = ufs_next_entry(sb, p);
- }
return (char *)p - base;
}
@@ -432,8 +423,8 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
int need_revalidate = file->f_version != inode->i_version;
@@ -454,14 +445,14 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
ufs_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
- ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode->i_version;
need_revalidate = 0;
@@ -469,12 +460,6 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
de = (struct ufs_dir_entry *)(kaddr+offset);
limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
- if (de->d_reclen == 0) {
- ufs_error(sb, __func__,
- "zero-length directory entry");
- ufs_put_page(page);
- return -EIO;
- }
if (de->d_ino) {
unsigned char d_type = DT_UNKNOWN;
@@ -574,7 +559,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
kmap(page);
base = (char*)page_address(page);
- memset(base, 0, PAGE_CACHE_SIZE);
+ memset(base, 0, PAGE_SIZE);
de = (struct ufs_dir_entry *) base;
@@ -594,7 +579,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
err = ufs_commit_chunk(page, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -653,7 +638,7 @@ not_empty:
const struct file_operations ufs_dir_operations = {
.read = generic_read_dir,
- .iterate = ufs_readdir,
+ .iterate_shared = ufs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index d897e169ab9c4..9f49431e798d6 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1051,13 +1051,13 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
lastfrag--;
lastpage = ufs_get_locked_page(mapping, lastfrag >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits));
+ (PAGE_SHIFT - inode->i_blkbits));
if (IS_ERR(lastpage)) {
err = -EIO;
goto out;
}
- end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+ end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
bh = page_buffers(lastpage);
for (i = 0; i < end; ++i)
bh = bh->b_this_page;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index acf4a3b61b81f..a1559f7628053 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -305,7 +305,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0);
else {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
inode_dec_link_count(old_dir);
}
@@ -315,11 +315,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 442fd52ebffe5..f04ab232d08d4 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -132,7 +132,7 @@ static struct dentry *ufs_get_parent(struct dentry *child)
ino = ufs_inode_by_name(d_inode(child), &dot_dot);
if (!ino)
return ERR_PTR(-ENOENT);
- return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino));
+ return d_obtain_alias(ufs_iget(child->d_sb, ino));
}
static const struct export_operations ufs_export_ops = {
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index b6c2f94e041ed..f41ad0a6106f2 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -118,7 +118,7 @@ void ubh_sync_block(struct ufs_buffer_head *ubh)
unsigned i;
for (i = 0; i < ubh->count; i++)
- write_dirty_buffer(ubh->bh[i], WRITE);
+ write_dirty_buffer(ubh->bh[i], 0);
for (i = 0; i < ubh->count; i++)
wait_on_buffer(ubh->bh[i]);
@@ -261,14 +261,14 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
if (unlikely(page->mapping == NULL)) {
/* Truncate got there first */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
goto out;
}
if (!PageUptodate(page) || PageError(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
printk(KERN_ERR "ufs_change_blocknr: "
"can not read page: ino %lu, index: %lu\n",
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 9541759282409..b7fbf53dbc81a 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -283,7 +283,7 @@ extern struct page *ufs_get_locked_page(struct address_space *mapping,
static inline void ufs_put_locked_page(struct page *page)
{
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 66cdb44616d5a..85959d8324df2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
- mmput(ctx->mm);
+ mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
}
@@ -257,10 +257,9 @@ out:
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*/
-int handle_userfault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags, unsigned long reason)
+int handle_userfault(struct fault_env *fe, unsigned long reason)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = fe->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS;
- ctx = vma->vm_userfaultfd_ctx.ctx;
+ ctx = fe->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
- if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+ BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
dump_stack();
}
#endif
@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* and wait.
*/
ret = VM_FAULT_RETRY;
- if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
/* take the reference before dropping the mmap_sem */
@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(address, flags, reason);
+ uwq.msg = userfault_msg(fe->address, fe->flags, reason);
uwq.ctx = ctx;
- return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ return_to_userland =
+ (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock);
- must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+ must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
@@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
ACCESS_ONCE(ctx->released) = true;
+ if (!mmget_not_zero(mm))
+ goto wakeup;
+
/*
* Flush page faults out of all CPUs. NOTE: all page faults
* must be retried without returning VM_FAULT_SIGBUS if
@@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
up_write(&mm->mmap_sem);
-
+ mmput(mm);
+wakeup:
/*
* After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
@@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
start = uffdio_register.range.start;
end = start + uffdio_register.range.len;
+ ret = -ENOMEM;
+ if (!mmget_not_zero(mm))
+ goto out;
+
down_write(&mm->mmap_sem);
vma = find_vma_prev(mm, start, &prev);
-
- ret = -ENOMEM;
if (!vma)
goto out_unlock;
@@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
} while (vma && vma->vm_start < end);
out_unlock:
up_write(&mm->mmap_sem);
+ mmput(mm);
if (!ret) {
/*
* Now that we scanned all vmas we can already tell
@@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
start = uffdio_unregister.start;
end = start + uffdio_unregister.len;
+ ret = -ENOMEM;
+ if (!mmget_not_zero(mm))
+ goto out;
+
down_write(&mm->mmap_sem);
vma = find_vma_prev(mm, start, &prev);
-
- ret = -ENOMEM;
if (!vma)
goto out_unlock;
@@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
} while (vma && vma->vm_start < end);
out_unlock:
up_write(&mm->mmap_sem);
+ mmput(mm);
out:
return ret;
}
@@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
goto out;
-
- ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len);
+ mmput(ctx->mm);
+ }
if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
return -EFAULT;
if (ret < 0)
@@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
goto out;
- ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ mmput(ctx->mm);
+ }
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
return -EFAULT;
if (ret < 0)
@@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags)
ctx->released = false;
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
- atomic_inc(&ctx->mm->mm_users);
+ atomic_inc(&ctx->mm->mm_count);
file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
if (IS_ERR(file)) {
- mmput(ctx->mm);
+ mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
out:
diff --git a/fs/utimes.c b/fs/utimes.c
index 85c40f4f373d5..794f5f5b1fb5c 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -92,10 +92,11 @@ static int utimes_common(struct path *path, struct timespec *times)
* then we need to check permissions, because
* inode_change_ok() won't do it.
*/
- error = -EACCES;
+ error = -EPERM;
if (IS_IMMUTABLE(inode))
goto mnt_drop_write_and_out;
+ error = -EACCES;
if (!inode_owner_or_capable(inode)) {
error = inode_permission(inode, MAY_WRITE);
if (error)
diff --git a/fs/xattr.c b/fs/xattr.c
index 4861322e28e8d..c243905835abd 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask)
if (mask & MAY_WRITE) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
+ /*
+ * Updating an xattr will likely cause i_uid and i_gid
+ * to be written back improperly if their true value is
+ * unknown to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EPERM;
}
/*
@@ -100,7 +107,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
if (issec)
inode->i_flags &= ~S_NOSEC;
if (inode->i_op->setxattr) {
- error = inode->i_op->setxattr(dentry, name, value, size, flags);
+ error = inode->i_op->setxattr(dentry, inode, name, value, size, flags);
if (!error) {
fsnotify_xattr(dentry);
security_inode_post_setxattr(dentry, name, value,
@@ -192,7 +199,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
if (!inode->i_op->getxattr)
return -EOPNOTSUPP;
- error = inode->i_op->getxattr(dentry, name, NULL, 0);
+ error = inode->i_op->getxattr(dentry, inode, name, NULL, 0);
if (error < 0)
return error;
@@ -203,7 +210,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
memset(value, 0, error + 1);
}
- error = inode->i_op->getxattr(dentry, name, value, error);
+ error = inode->i_op->getxattr(dentry, inode, name, value, error);
*xattr_value = value;
return error;
}
@@ -236,7 +243,7 @@ vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
}
nolsm:
if (inode->i_op->getxattr)
- error = inode->i_op->getxattr(dentry, name, value, size);
+ error = inode->i_op->getxattr(dentry, inode, name, value, size);
else
error = -EOPNOTSUPP;
@@ -655,6 +662,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
* operations to the correct xattr_handler.
*/
#define for_each_xattr_handler(handlers, handler) \
+ if (handlers) \
for ((handler) = *(handlers)++; \
(handler) != NULL; \
(handler) = *(handlers)++)
@@ -668,7 +676,7 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
const struct xattr_handler *handler;
if (!*name)
- return NULL;
+ return ERR_PTR(-EINVAL);
for_each_xattr_handler(handlers, handler) {
const char *n;
@@ -691,14 +699,16 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
* Find the handler for the prefix and dispatch its get() operation.
*/
ssize_t
-generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
+generic_getxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
- return handler->get(handler, dentry, name, buffer, size);
+ return handler->get(handler, dentry, inode,
+ name, buffer, size);
}
/*
@@ -742,7 +752,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
* Find the handler for the prefix and dispatch its set() operation.
*/
int
-generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
+generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
{
const struct xattr_handler *handler;
@@ -751,7 +762,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
- return handler->set(handler, dentry, name, value, size, flags);
+ return handler->set(handler, dentry, inode, name, value, size, flags);
}
/*
@@ -766,7 +777,8 @@ generic_removexattr(struct dentry *dentry, const char *name)
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
- return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
+ return handler->set(handler, dentry, d_inode(dentry), name, NULL,
+ 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(generic_getxattr);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4df61eac..35faf128f36d8 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
depends on (64BIT || LBDAF)
select EXPORTFS
select LIBCRC32C
+ select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 3542d94fddce5..fc593c8694936 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -39,6 +39,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_btree.o \
xfs_da_btree.o \
xfs_da_format.o \
+ xfs_defer.o \
xfs_dir2.o \
xfs_dir2_block.o \
xfs_dir2_data.o \
@@ -51,6 +52,8 @@ xfs-y += $(addprefix libxfs/, \
xfs_inode_fork.o \
xfs_inode_buf.o \
xfs_log_rlimit.o \
+ xfs_rmap.o \
+ xfs_rmap_btree.o \
xfs_sb.o \
xfs_symlink_remote.o \
xfs_trans_resv.o \
@@ -100,11 +103,13 @@ xfs-y += xfs_log.o \
xfs_extfree_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
+ xfs_rmap_item.o \
xfs_log_recover.o \
xfs_trans_ail.o \
xfs_trans_buf.o \
xfs_trans_extfree.o \
xfs_trans_inode.o \
+ xfs_trans_rmap.o \
# optional features
xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
@@ -121,5 +126,4 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
-xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
-xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o
+xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 686ba6fb20ddc..339c696bbc018 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
}
void *
-kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
- xfs_km_flags_t flags)
+kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
{
- void *new;
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
- new = kmem_alloc(newsize, flags);
- if (ptr) {
- if (new)
- memcpy(new, ptr,
- ((oldsize < newsize) ? oldsize : newsize));
- kmem_free(ptr);
- }
- return new;
+ do {
+ ptr = krealloc(old, newsize, lflags);
+ if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ return ptr;
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
+ current->comm, current->pid,
+ newsize, __func__, lflags);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ } while (1);
}
void *
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index d1c66e465ca56..689f746224e7a 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
extern void *kmem_alloc(size_t, xfs_km_flags_t);
extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
kvfree(ptr);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a708e38b494c7..776ae2f325d1e 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -24,8 +24,10 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
@@ -49,6 +51,81 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+xfs_extlen_t
+xfs_prealloc_blocks(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
+ * AGF buffer (PV 947395), we place constraints on the relationship among
+ * actual allocations for data blocks, freelist blocks, and potential file data
+ * bmap btree blocks. However, these restrictions may result in no actual space
+ * allocated for a delayed extent, for example, a data block in a certain AG is
+ * allocated but there is no additional block for the additional bmap btree
+ * block due to a split of the bmap btree of the file. The result of this may
+ * lead to an infinite loop when the file gets flushed to disk and all delayed
+ * extents need to be actually allocated. To get around this, we explicitly set
+ * aside a few blocks which will not be reserved in delayed allocation.
+ *
+ * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
+ * and 4 more to handle a potential split of the file's bmap btree.
+ *
+ * When rmap is enabled, we must also be able to handle two rmap btree inserts
+ * to record both the file data extent and a new bmbt block. The bmbt block
+ * might not be in the same AG as the file data extent. In the worst case
+ * the bmap btree splits multiple levels and all the new blocks come from
+ * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ */
+unsigned int
+xfs_alloc_set_aside(
+ struct xfs_mount *mp)
+{
+ unsigned int blocks;
+
+ blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
+ return blocks;
+}
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size of the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ * - the AG superblock, AGF, AGI and AGFL
+ * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
+ * the AGI free inode btree root block.
+ * - blocks on the AGFL according to xfs_alloc_set_aside() limits
+ * - optionally, the rmapbt root block
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+unsigned int
+xfs_alloc_ag_max_usable(
+ struct xfs_mount *mp)
+{
+ unsigned int blocks;
+
+ blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
+ blocks += XFS_ALLOC_AGFL_RESERVE;
+ blocks += 3; /* AGF, AGI btree root blocks */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ blocks++; /* finobt root block */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ blocks++; /* rmap root block */
+
+ return mp->m_sb.sb_agblocks - blocks;
+}
+
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
*/
@@ -84,7 +161,7 @@ xfs_alloc_lookup_ge(
* Lookup the first record less than or equal to [bno, len]
* in the btree given by cur.
*/
-int /* error */
+static int /* error */
xfs_alloc_lookup_le(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
@@ -636,6 +713,14 @@ xfs_alloc_ag_vextent(
ASSERT(!args->wasfromfl || !args->isfl);
ASSERT(args->agbno % args->alignment == 0);
+ /* if not file data, insert new block into the reverse map btree */
+ if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
+ args->agbno, args->len, &args->oinfo);
+ if (error)
+ return error;
+ }
+
if (!args->wasfromfl) {
error = xfs_alloc_update_counters(args->tp, args->pag,
args->agbp,
@@ -1577,14 +1662,15 @@ error0:
/*
* Free the extent starting at agno/bno for length.
*/
-STATIC int /* error */
+STATIC int
xfs_free_ag_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* buffer for a.g. freelist header */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t bno, /* starting block number */
- xfs_extlen_t len, /* length of extent */
- int isfl) /* set if is freelist blocks - no sb acctg */
+ xfs_trans_t *tp,
+ xfs_buf_t *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo,
+ int isfl)
{
xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
@@ -1601,12 +1687,19 @@ xfs_free_ag_extent(
xfs_extlen_t nlen; /* new length of freespace */
xfs_perag_t *pag; /* per allocation group data */
+ bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
+
+ if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
+ if (error)
+ goto error0;
+ }
+
/*
* Allocate and initialize a cursor for the by-block btree.
*/
bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
- cnt_cur = NULL;
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1839,19 +1932,8 @@ void
xfs_alloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- int level;
- uint maxblocks;
- uint maxleafents;
- int minleafrecs;
- int minnoderecs;
-
- maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
- minleafrecs = mp->m_alloc_mnr[0];
- minnoderecs = mp->m_alloc_mnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
- mp->m_ag_maxlevels = level;
+ mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
+ (mp->m_sb.sb_agblocks + 1) / 2);
}
/*
@@ -1886,6 +1968,11 @@ xfs_alloc_min_freelist(
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
mp->m_ag_maxlevels);
+ /* space needed by the reverse mapping used space btree */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ min_free += min_t(unsigned int,
+ pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
+ mp->m_rmap_maxlevels);
return min_free;
}
@@ -2003,21 +2090,34 @@ xfs_alloc_fix_freelist(
* anything other than extra overhead when we need to put more blocks
* back on the free list? Maybe we should only do this when space is
* getting low or the AGFL is more than half full?
+ *
+ * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
+ * big; the NORMAP flag prevents AGFL expand/shrink operations from
+ * updating the rmapbt. Both flags are used in xfs_repair while we're
+ * rebuilding the rmapbt, and neither is used by the kernel. They're
+ * both required to ensure that rmaps are correctly recorded for the
+ * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
+ * repair/rmap.c in xfsprogs for details.
*/
- while (pag->pagf_flcount > need) {
+ memset(&targs, 0, sizeof(targs));
+ if (flags & XFS_ALLOC_FLAG_NORMAP)
+ xfs_rmap_skip_owner_update(&targs.oinfo);
+ else
+ xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
+ while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
struct xfs_buf *bp;
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
- error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+ error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
+ &targs.oinfo, 1);
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
xfs_trans_binval(tp, bp);
}
- memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
targs.agbp = agbp;
@@ -2282,6 +2382,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
return false;
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
+ return false;
+
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2413,6 +2517,8 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
pag->pagf_levels[XFS_BTNUM_CNTi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ pag->pagf_levels[XFS_BTNUM_RMAPi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
@@ -2658,55 +2764,85 @@ error0:
return error;
}
-/*
- * Free an extent.
- * Just break up the extent address and hand off to xfs_free_ag_extent
- * after fixing up the freelist.
- */
-int /* error */
-xfs_free_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
- xfs_extlen_t len) /* length of extent */
+/* Ensure that the freelist is at full capacity. */
+int
+xfs_free_extent_fix_freelist(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **agbp)
{
- xfs_alloc_arg_t args;
- int error;
+ struct xfs_alloc_arg args;
+ int error;
- ASSERT(len != 0);
- memset(&args, 0, sizeof(xfs_alloc_arg_t));
+ memset(&args, 0, sizeof(struct xfs_alloc_arg));
args.tp = tp;
args.mp = tp->t_mountp;
+ args.agno = agno;
/*
* validate that the block number is legal - the enables us to detect
* and handle a silent filesystem corruption rather than crashing.
*/
- args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
if (args.agno >= args.mp->m_sb.sb_agcount)
return -EFSCORRUPTED;
- args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
- if (args.agbno >= args.mp->m_sb.sb_agblocks)
- return -EFSCORRUPTED;
-
args.pag = xfs_perag_get(args.mp, args.agno);
ASSERT(args.pag);
error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
if (error)
- goto error0;
+ goto out;
+
+ *agbp = args.agbp;
+out:
+ xfs_perag_put(args.pag);
+ return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int /* error */
+xfs_free_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ struct xfs_owner_info *oinfo) /* extent owner */
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ int error;
+
+ ASSERT(len != 0);
+
+ if (XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_FREE_EXTENT,
+ XFS_RANDOM_FREE_EXTENT))
+ return -EIO;
+
+ error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+ if (error)
+ return error;
+
+ XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
/* validate the extent size is legal now we have the agf locked */
- if (args.agbno + len >
- be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
- error = -EFSCORRUPTED;
- goto error0;
- }
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
+ err);
- error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
- if (!error)
- xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
-error0:
- xfs_perag_put(args.pag);
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
+ if (error)
+ goto err;
+
+ xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+ return 0;
+
+err:
+ xfs_trans_brelse(tp, agbp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 135eb3d24db71..6fe2d6b7cfe93 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -54,41 +54,8 @@ typedef unsigned int xfs_alloctype_t;
*/
#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
-
-/*
- * In order to avoid ENOSPC-related deadlock caused by
- * out-of-order locking of AGF buffer (PV 947395), we place
- * constraints on the relationship among actual allocations for
- * data blocks, freelist blocks, and potential file data bmap
- * btree blocks. However, these restrictions may result in no
- * actual space allocated for a delayed extent, for example, a data
- * block in a certain AG is allocated but there is no additional
- * block for the additional bmap btree block due to a split of the
- * bmap btree of the file. The result of this may lead to an
- * infinite loop in xfssyncd when the file gets flushed to disk and
- * all delayed extents need to be actually allocated. To get around
- * this, we explicitly set aside a few blocks which will not be
- * reserved in delayed allocation. Considering the minimum number of
- * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
- * btree requires 1 fsb, so we set the number of set-aside blocks
- * to 4 + 4*agcount.
- */
-#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
-
-/*
- * When deciding how much space to allocate out of an AG, we limit the
- * allocation maximum size to the size the AG. However, we cannot use all the
- * blocks in the AG - some are permanently used by metadata. These
- * blocks are generally:
- * - the AG superblock, AGF, AGI and AGFL
- * - the AGF (bno and cnt) and AGI btree root blocks
- * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
- *
- * The AG headers are sector sized, so the amount of space they take up is
- * dependent on filesystem geometry. The others are all single blocks.
- */
-#define XFS_ALLOC_AG_MAX_USABLE(mp) \
- ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */
+#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */
/*
@@ -123,6 +90,7 @@ typedef struct xfs_alloc_arg {
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
+ struct xfs_owner_info oinfo; /* owner of blocks being allocated */
} xfs_alloc_arg_t;
/*
@@ -132,6 +100,11 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
+/* freespace limit calculations */
+#define XFS_ALLOC_AGFL_RESERVE 4
+unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
+unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
+
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag, xfs_extlen_t need);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
@@ -208,16 +181,10 @@ xfs_alloc_vextent(
*/
int /* error */
xfs_free_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
- xfs_extlen_t len); /* length of extent */
-
-int /* error */
-xfs_alloc_lookup_le(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- int *stat); /* success/failure */
+ struct xfs_owner_info *oinfo);/* extent owner */
int /* error */
xfs_alloc_lookup_ge(
@@ -236,5 +203,9 @@ xfs_alloc_get_rec(
int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **agbp);
+
+xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index d9b42425291e3..5ba2dac5e67c4 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -212,17 +212,6 @@ xfs_allocbt_init_key_from_rec(
}
STATIC void
-xfs_allocbt_init_rec_from_key(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
-{
- ASSERT(key->alloc.ar_startblock != 0);
-
- rec->alloc.ar_startblock = key->alloc.ar_startblock;
- rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
-}
-
-STATIC void
xfs_allocbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -406,7 +395,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
- .init_rec_from_key = xfs_allocbt_init_rec_from_key,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index fa3b948ef9c25..af1ecb19121e9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_sf.h"
@@ -203,7 +204,7 @@ xfs_attr_set(
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_da_args args;
- struct xfs_bmap_free flist;
+ struct xfs_defer_ops dfops;
struct xfs_trans_res tres;
xfs_fsblock_t firstblock;
int rsvd = (flags & ATTR_ROOT) != 0;
@@ -221,7 +222,7 @@ xfs_attr_set(
args.value = value;
args.valuelen = valuelen;
args.firstblock = &firstblock;
- args.flist = &flist;
+ args.dfops = &dfops;
args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
args.total = xfs_attr_calc_size(&args, &local);
@@ -242,37 +243,21 @@ xfs_attr_set(
return error;
}
- /*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
- args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+ tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+ tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
-
- if (rsvd)
- args.trans->t_flags |= XFS_TRANS_RESERVE;
-
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
- if (error) {
- xfs_trans_cancel(args.trans);
+ error = xfs_trans_alloc(mp, &tres, args.total, 0,
+ rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
+ if (error)
return error;
- }
- xfs_ilock(dp, XFS_ILOCK_EXCL);
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
@@ -332,13 +317,13 @@ xfs_attr_set(
* It won't fit in the shortform, transform to a leaf block.
* GROT: another possible req'mt for a double-split btree op.
*/
- xfs_bmap_init(args.flist, args.firstblock);
+ xfs_defer_init(args.dfops, args.firstblock);
error = xfs_attr_shortform_to_leaf(&args);
if (!error)
- error = xfs_bmap_finish(&args.trans, args.flist, dp);
+ error = xfs_defer_finish(&args.trans, args.dfops, dp);
if (error) {
args.trans = NULL;
- xfs_bmap_cancel(&flist);
+ xfs_defer_cancel(&dfops);
goto out;
}
@@ -398,7 +383,7 @@ xfs_attr_remove(
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_da_args args;
- struct xfs_bmap_free flist;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t firstblock;
int error;
@@ -415,7 +400,7 @@ xfs_attr_remove(
return error;
args.firstblock = &firstblock;
- args.flist = &flist;
+ args.dfops = &dfops;
/*
* we have no control over the attribute names that userspace passes us
@@ -429,31 +414,15 @@ xfs_attr_remove(
return error;
/*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
- args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-
- /*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
-
- if (flags & ATTR_ROOT)
- args.trans->t_flags |= XFS_TRANS_RESERVE;
-
- error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
- XFS_ATTRRM_SPACE_RES(mp), 0);
- if (error) {
- xfs_trans_cancel(args.trans);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
+ XFS_ATTRRM_SPACE_RES(mp), 0,
+ (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
+ &args.trans);
+ if (error)
return error;
- }
xfs_ilock(dp, XFS_ILOCK_EXCL);
/*
@@ -616,13 +585,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Commit that transaction so that the node_addname() call
* can manage its own transactions.
*/
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
@@ -706,15 +675,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
}
@@ -769,14 +738,14 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
}
@@ -895,14 +864,14 @@ restart:
*/
xfs_da_state_free(state);
state = NULL;
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
@@ -923,13 +892,13 @@ restart:
* in the index/blkno/rmtblkno/rmtblkcnt fields and
* in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
*/
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_split(state);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
} else {
@@ -1022,14 +991,14 @@ restart:
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_join(state);
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
}
@@ -1145,13 +1114,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_join(state);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
/*
@@ -1178,15 +1147,15 @@ xfs_attr_node_removename(xfs_da_args_t *args)
goto out;
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
} else
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 01a5ecfedfcf1..8ea91f3630938 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -792,7 +792,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
nargs.dp = dp;
nargs.geo = args->geo;
nargs.firstblock = args->firstblock;
- nargs.flist = args->flist;
+ nargs.dfops = args->dfops;
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
@@ -922,7 +922,7 @@ xfs_attr3_leaf_to_shortform(
nargs.geo = args->geo;
nargs.dp = dp;
nargs.firstblock = args->firstblock;
- nargs.flist = args->flist;
+ nargs.dfops = args->dfops;
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 882c8d3388913..4f2aed04f8273 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
-int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
struct xfs_da_state_blk *drop_blk,
struct xfs_da_state_blk *save_blk);
-int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
/*
* Utility routines.
*/
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index a572532a55cdc..d52f525f5b2df 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -24,6 +24,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
@@ -460,16 +461,16 @@ xfs_attr_rmtval_set(
* extent and then crash then the block may not contain the
* correct metadata after log recovery occurs.
*/
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
nmap = 1;
error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
- args->total, &map, &nmap, args->flist);
+ args->total, &map, &nmap, args->dfops);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
@@ -503,7 +504,7 @@ xfs_attr_rmtval_set(
ASSERT(blkcnt > 0);
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
nmap = 1;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
blkcnt, &map, &nmap,
@@ -603,16 +604,16 @@ xfs_attr_rmtval_remove(
blkcnt = args->rmtblkcnt;
done = 0;
while (!done) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK, 1, args->firstblock,
- args->flist, &done);
+ args->dfops, &done);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist,
+ error = xfs_defer_finish(&args->trans, args->dfops,
args->dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 041b6948aeccd..b060bca934027 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
@@ -45,6 +46,7 @@
#include "xfs_symlink.h"
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
+#include "xfs_rmap.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -570,14 +572,13 @@ xfs_bmap_validate_ret(
*/
void
xfs_bmap_add_free(
- xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len, /* length of extent */
- xfs_bmap_free_t *flist, /* list of extents */
- xfs_mount_t *mp) /* mount point structure */
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ struct xfs_owner_info *oinfo)
{
- xfs_bmap_free_item_t *cur; /* current (next) element */
- xfs_bmap_free_item_t *new; /* new element */
- xfs_bmap_free_item_t *prev; /* previous element */
+ struct xfs_extent_free_item *new; /* new element */
#ifdef DEBUG
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -594,59 +595,17 @@ xfs_bmap_add_free(
ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
#endif
ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
- new->xbfi_startblock = bno;
- new->xbfi_blockcount = (xfs_extlen_t)len;
- for (prev = NULL, cur = flist->xbf_first;
- cur != NULL;
- prev = cur, cur = cur->xbfi_next) {
- if (cur->xbfi_startblock >= bno)
- break;
- }
- if (prev)
- prev->xbfi_next = new;
- else
- flist->xbf_first = new;
- new->xbfi_next = cur;
- flist->xbf_count++;
-}
-/*
- * Remove the entry "free" from the free item list. Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-void
-xfs_bmap_del_free(
- xfs_bmap_free_t *flist, /* free item list header */
- xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free) /* list item to be freed */
-{
- if (prev)
- prev->xbfi_next = free->xbfi_next;
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (oinfo)
+ new->xefi_oinfo = *oinfo;
else
- flist->xbf_first = free->xbfi_next;
- flist->xbf_count--;
- kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
- xfs_bmap_free_t *flist) /* list of bmap_free_items */
-{
- xfs_bmap_free_item_t *free; /* free list item */
- xfs_bmap_free_item_t *next;
-
- if (flist->xbf_count == 0)
- return;
- ASSERT(flist->xbf_first != NULL);
- for (free = flist->xbf_first; free; free = next) {
- next = free->xbfi_next;
- xfs_bmap_del_free(flist, NULL, free);
- }
- ASSERT(flist->xbf_count == 0);
+ xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+ trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0,
+ XFS_FSB_TO_AGBNO(mp, bno), len);
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
}
/*
@@ -676,6 +635,7 @@ xfs_bmap_btree_to_extents(
xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
struct xfs_btree_block *rblock;/* root btree block */
+ struct xfs_owner_info oinfo;
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -699,7 +659,8 @@ xfs_bmap_btree_to_extents(
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
- xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+ xfs_bmap_add_free(mp, cur->bc_private.b.dfops, cbno, 1, &oinfo);
ip->i_d.di_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
@@ -722,7 +683,7 @@ xfs_bmap_extents_to_btree(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first-block-allocated */
- xfs_bmap_free_t *flist, /* blocks freed in xaction */
+ struct xfs_defer_ops *dfops, /* blocks freed in xaction */
xfs_btree_cur_t **curp, /* cursor returned to caller */
int wasdel, /* converting a delayed alloc */
int *logflagsp, /* inode logging flags */
@@ -771,7 +732,7 @@ xfs_bmap_extents_to_btree(
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
/*
* Convert to a btree with two levels, one record in root.
@@ -780,11 +741,12 @@ xfs_bmap_extents_to_btree(
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = mp;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork);
args.firstblock = *firstblock;
if (*firstblock == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
- } else if (flist->xbf_low) {
+ } else if (dfops->dop_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = *firstblock;
} else {
@@ -805,7 +767,7 @@ xfs_bmap_extents_to_btree(
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(*firstblock == NULLFSBLOCK ||
args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
- (flist->xbf_low &&
+ (dfops->dop_low &&
args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
*firstblock = cur->bc_private.b.firstblock = args.fsbno;
cur->bc_private.b.allocated++;
@@ -926,6 +888,7 @@ xfs_bmap_local_to_extents(
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
+ xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0);
args.firstblock = *firstblock;
/*
* Allocate a block. We know we need only one, since the
@@ -990,7 +953,7 @@ xfs_bmap_add_attrfork_btree(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
+ struct xfs_defer_ops *dfops, /* blocks to free at commit */
int *flags) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
@@ -1003,7 +966,7 @@ xfs_bmap_add_attrfork_btree(
*flags |= XFS_ILOG_DBROOT;
else {
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.firstblock = *firstblock;
if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
goto error0;
@@ -1033,7 +996,7 @@ xfs_bmap_add_attrfork_extents(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
+ struct xfs_defer_ops *dfops, /* blocks to free at commit */
int *flags) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
@@ -1042,7 +1005,7 @@ xfs_bmap_add_attrfork_extents(
if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
return 0;
cur = NULL;
- error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+ error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops, &cur, 0,
flags, XFS_DATA_FORK);
if (cur) {
cur->bc_private.b.allocated = 0;
@@ -1068,7 +1031,7 @@ xfs_bmap_add_attrfork_local(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
+ struct xfs_defer_ops *dfops, /* blocks to free at commit */
int *flags) /* inode logging flags */
{
xfs_da_args_t dargs; /* args for dir/attr code */
@@ -1081,7 +1044,7 @@ xfs_bmap_add_attrfork_local(
dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
dargs.firstblock = firstblock;
- dargs.flist = flist;
+ dargs.dfops = dfops;
dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
@@ -1109,7 +1072,7 @@ xfs_bmap_add_attrfork(
int rsvd) /* xact may use reserved blks */
{
xfs_fsblock_t firstblock; /* 1st block/ag allocated */
- xfs_bmap_free_t flist; /* freed extent records */
+ struct xfs_defer_ops dfops; /* freed extent records */
xfs_mount_t *mp; /* mount structure */
xfs_trans_t *tp; /* transaction pointer */
int blks; /* space reservation */
@@ -1121,15 +1084,14 @@ xfs_bmap_add_attrfork(
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
- tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+
blks = XFS_ADDAFORK_SPACE_RES(mp);
- if (rsvd)
- tp->t_flags |= XFS_TRANS_RESERVE;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
- if (error) {
- xfs_trans_cancel(tp);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+ if (error)
return error;
- }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
@@ -1176,18 +1138,18 @@ xfs_bmap_add_attrfork(
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
- xfs_bmap_init(&flist, &firstblock);
+ xfs_defer_init(&dfops, &firstblock);
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_LOCAL:
- error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+ error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &dfops,
&logflags);
break;
case XFS_DINODE_FMT_EXTENTS:
error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
- &flist, &logflags);
+ &dfops, &logflags);
break;
case XFS_DINODE_FMT_BTREE:
- error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &dfops,
&logflags);
break;
default:
@@ -1216,7 +1178,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_bmap_finish(&tp, &flist, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto bmap_cancel;
error = xfs_trans_commit(tp);
@@ -1224,7 +1186,7 @@ xfs_bmap_add_attrfork(
return error;
bmap_cancel:
- xfs_bmap_cancel(&flist);
+ xfs_defer_cancel(&dfops);
trans_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -2021,7 +1983,7 @@ xfs_bmap_add_extent_delay_real(
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist,
+ bma->firstblock, bma->dfops,
&bma->cur, 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
@@ -2105,7 +2067,7 @@ xfs_bmap_add_extent_delay_real(
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur, 1,
+ bma->firstblock, bma->dfops, &bma->cur, 1,
&tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
@@ -2174,7 +2136,7 @@ xfs_bmap_add_extent_delay_real(
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur,
+ bma->firstblock, bma->dfops, &bma->cur,
1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
@@ -2217,13 +2179,18 @@ xfs_bmap_add_extent_delay_real(
ASSERT(0);
}
+ /* add reverse mapping */
+ error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+ if (error)
+ goto done;
+
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur,
+ bma->firstblock, bma->dfops, &bma->cur,
da_old > 0, &tmp_logflags, whichfork);
bma->logflags |= tmp_logflags;
if (error)
@@ -2265,7 +2232,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
+ struct xfs_defer_ops *dfops, /* list of extents to be freed */
int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
@@ -2753,12 +2720,17 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(0);
}
+ /* update reverse mappings */
+ error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
+ if (error)
+ goto done;
+
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+ error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
0, &tmp_logflags, XFS_DATA_FORK);
*logflagsp |= tmp_logflags;
if (error)
@@ -3145,13 +3117,18 @@ xfs_bmap_add_extent_hole_real(
break;
}
+ /* add reverse mapping */
+ error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+ if (error)
+ goto done;
+
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur,
+ bma->firstblock, bma->dfops, &bma->cur,
0, &tmp_logflags, whichfork);
bma->logflags |= tmp_logflags;
if (error)
@@ -3709,9 +3686,10 @@ xfs_bmap_btalloc(
args.tp = ap->tp;
args.mp = mp;
args.fsbno = ap->blkno;
+ xfs_rmap_skip_owner_update(&args.oinfo);
/* Trim the allocation back to the maximum an AG can fit. */
- args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+ args.maxlen = MIN(ap->length, mp->m_ag_max_usable);
args.firstblock = *ap->firstblock;
blen = 0;
if (nullfb) {
@@ -3726,7 +3704,7 @@ xfs_bmap_btalloc(
error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
if (error)
return error;
- } else if (ap->flist->xbf_low) {
+ } else if (ap->dfops->dop_low) {
if (xfs_inode_is_filestream(ap->ip))
args.type = XFS_ALLOCTYPE_FIRST_AG;
else
@@ -3742,11 +3720,11 @@ xfs_bmap_btalloc(
args.prod = align;
if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
- } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+ } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
args.prod = 1;
args.mod = 0;
} else {
- args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+ args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
}
@@ -3759,7 +3737,7 @@ xfs_bmap_btalloc(
* is >= the stripe unit and the allocation offset is
* at the end of file.
*/
- if (!ap->flist->xbf_low && ap->aeof) {
+ if (!ap->dfops->dop_low && ap->aeof) {
if (!ap->offset) {
args.alignment = stripe_align;
atype = args.type;
@@ -3852,7 +3830,7 @@ xfs_bmap_btalloc(
args.minleft = 0;
if ((error = xfs_alloc_vextent(&args)))
return error;
- ap->flist->xbf_low = 1;
+ ap->dfops->dop_low = true;
}
if (args.fsbno != NULLFSBLOCK) {
/*
@@ -3862,7 +3840,7 @@ xfs_bmap_btalloc(
ASSERT(*ap->firstblock == NULLFSBLOCK ||
XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
XFS_FSB_TO_AGNO(mp, args.fsbno) ||
- (ap->flist->xbf_low &&
+ (ap->dfops->dop_low &&
XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
XFS_FSB_TO_AGNO(mp, args.fsbno)));
@@ -3870,7 +3848,7 @@ xfs_bmap_btalloc(
if (*ap->firstblock == NULLFSBLOCK)
*ap->firstblock = args.fsbno;
ASSERT(nullfb || fb_agno == args.agno ||
- (ap->flist->xbf_low && fb_agno < args.agno));
+ (ap->dfops->dop_low && fb_agno < args.agno));
ap->length = args.len;
ap->ip->i_d.di_nblocks += args.len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
@@ -4337,7 +4315,7 @@ xfs_bmapi_allocate(
if (error)
return error;
- if (bma->flist->xbf_low)
+ if (bma->dfops->dop_low)
bma->minleft = 0;
if (bma->cur)
bma->cur->bc_private.b.firstblock = *bma->firstblock;
@@ -4346,7 +4324,7 @@ xfs_bmapi_allocate(
if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
bma->cur->bc_private.b.firstblock = *bma->firstblock;
- bma->cur->bc_private.b.flist = bma->flist;
+ bma->cur->bc_private.b.dfops = bma->dfops;
}
/*
* Bump the number of extents we've allocated
@@ -4427,7 +4405,7 @@ xfs_bmapi_convert_unwritten(
bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
bma->ip, whichfork);
bma->cur->bc_private.b.firstblock = *bma->firstblock;
- bma->cur->bc_private.b.flist = bma->flist;
+ bma->cur->bc_private.b.dfops = bma->dfops;
}
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
@@ -4444,7 +4422,7 @@ xfs_bmapi_convert_unwritten(
}
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
- &bma->cur, mval, bma->firstblock, bma->flist,
+ &bma->cur, mval, bma->firstblock, bma->dfops,
&tmp_logflags);
/*
* Log the inode core unconditionally in the unwritten extent conversion
@@ -4498,7 +4476,7 @@ xfs_bmapi_write(
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap, /* i/o: mval size/count */
- struct xfs_bmap_free *flist) /* i/o: list extents to free */
+ struct xfs_defer_ops *dfops) /* i/o: list extents to free */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -4588,7 +4566,7 @@ xfs_bmapi_write(
bma.ip = ip;
bma.total = total;
bma.userdata = 0;
- bma.flist = flist;
+ bma.dfops = dfops;
bma.firstblock = firstblock;
while (bno < end && n < *nmap) {
@@ -4702,7 +4680,7 @@ error0:
XFS_FSB_TO_AGNO(mp, *firstblock) ==
XFS_FSB_TO_AGNO(mp,
bma.cur->bc_private.b.firstblock) ||
- (flist->xbf_low &&
+ (dfops->dop_low &&
XFS_FSB_TO_AGNO(mp, *firstblock) <
XFS_FSB_TO_AGNO(mp,
bma.cur->bc_private.b.firstblock)));
@@ -4786,7 +4764,7 @@ xfs_bmap_del_extent(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
xfs_extnum_t *idx, /* extent number to update/delete */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
+ struct xfs_defer_ops *dfops, /* list of extents to be freed */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
@@ -4888,6 +4866,7 @@ xfs_bmap_del_extent(
nblks = 0;
do_fx = 0;
}
+
/*
* Set flag value to use in switch statement.
* Left-contig is 2, right-contig is 1.
@@ -5070,12 +5049,20 @@ xfs_bmap_del_extent(
++*idx;
break;
}
+
+ /* remove reverse mapping */
+ if (!delay) {
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del);
+ if (error)
+ goto done;
+ }
+
/*
* If we need to, add to list of extents to delete.
*/
if (do_fx)
- xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
- mp);
+ xfs_bmap_add_free(mp, dfops, del->br_startblock,
+ del->br_blockcount, NULL);
/*
* Adjust inode # blocks in the file.
*/
@@ -5115,7 +5102,7 @@ xfs_bunmapi(
xfs_extnum_t nexts, /* number of extents max */
xfs_fsblock_t *firstblock, /* first allocated block
controls a.g. for allocs */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ struct xfs_defer_ops *dfops, /* i/o: list extents to free */
int *done) /* set if not done yet */
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
@@ -5188,7 +5175,7 @@ xfs_bunmapi(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
} else
cur = NULL;
@@ -5197,8 +5184,10 @@ xfs_bunmapi(
/*
* Synchronize by locking the bitmap inode.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
}
extno = 0;
@@ -5280,7 +5269,7 @@ xfs_bunmapi(
}
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp, ip,
- &lastx, &cur, &del, firstblock, flist,
+ &lastx, &cur, &del, firstblock, dfops,
&logflags);
if (error)
goto error0;
@@ -5339,7 +5328,7 @@ xfs_bunmapi(
lastx--;
error = xfs_bmap_add_extent_unwritten_real(tp,
ip, &lastx, &cur, &prev,
- firstblock, flist, &logflags);
+ firstblock, dfops, &logflags);
if (error)
goto error0;
goto nodelete;
@@ -5348,7 +5337,7 @@ xfs_bunmapi(
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp,
ip, &lastx, &cur, &del,
- firstblock, flist, &logflags);
+ firstblock, dfops, &logflags);
if (error)
goto error0;
goto nodelete;
@@ -5406,7 +5395,7 @@ xfs_bunmapi(
} else if (cur)
cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
- error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+ error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
@@ -5440,7 +5429,7 @@ nodelete:
*/
if (xfs_bmap_needs_btree(ip, whichfork)) {
ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+ error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops,
&cur, 0, &tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
@@ -5607,7 +5596,8 @@ xfs_bmse_shift_one(
struct xfs_bmbt_rec_host *gotp,
struct xfs_btree_cur *cur,
int *logflags,
- enum shift_direction direction)
+ enum shift_direction direction,
+ struct xfs_defer_ops *dfops)
{
struct xfs_ifork *ifp;
struct xfs_mount *mp;
@@ -5655,9 +5645,13 @@ xfs_bmse_shift_one(
/* check whether to merge the extent or shift it down */
if (xfs_bmse_can_merge(&adj_irec, &got,
offset_shift_fsb)) {
- return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
- *current_ext, gotp, adj_irecp,
- cur, logflags);
+ error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+ *current_ext, gotp, adj_irecp,
+ cur, logflags);
+ if (error)
+ return error;
+ adj_irec = got;
+ goto update_rmap;
}
} else {
startoff = got.br_startoff + offset_shift_fsb;
@@ -5694,9 +5688,10 @@ update_current_ext:
(*current_ext)--;
xfs_bmbt_set_startoff(gotp, startoff);
*logflags |= XFS_ILOG_CORE;
+ adj_irec = got;
if (!cur) {
*logflags |= XFS_ILOG_DEXT;
- return 0;
+ goto update_rmap;
}
error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
@@ -5706,8 +5701,18 @@ update_current_ext:
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
got.br_startoff = startoff;
- return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
- got.br_blockcount, got.br_state);
+ error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+ got.br_blockcount, got.br_state);
+ if (error)
+ return error;
+
+update_rmap:
+ /* update reverse mapping */
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec);
+ if (error)
+ return error;
+ adj_irec.br_startoff = startoff;
+ return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec);
}
/*
@@ -5729,7 +5734,7 @@ xfs_bmap_shift_extents(
int *done,
xfs_fileoff_t stop_fsb,
xfs_fsblock_t *firstblock,
- struct xfs_bmap_free *flist,
+ struct xfs_defer_ops *dfops,
enum shift_direction direction,
int num_exts)
{
@@ -5774,7 +5779,7 @@ xfs_bmap_shift_extents(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
}
@@ -5835,7 +5840,7 @@ xfs_bmap_shift_extents(
while (nexts++ < num_exts) {
error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
&current_ext, gotp, cur, &logflags,
- direction);
+ direction, dfops);
if (error)
goto del_cursor;
/*
@@ -5883,7 +5888,7 @@ xfs_bmap_split_extent_at(
struct xfs_inode *ip,
xfs_fileoff_t split_fsb,
xfs_fsblock_t *firstfsb,
- struct xfs_bmap_free *free_list)
+ struct xfs_defer_ops *dfops)
{
int whichfork = XFS_DATA_FORK;
struct xfs_btree_cur *cur = NULL;
@@ -5945,7 +5950,7 @@ xfs_bmap_split_extent_at(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstfsb;
- cur->bc_private.b.flist = free_list;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
got.br_startblock,
@@ -5998,7 +6003,7 @@ xfs_bmap_split_extent_at(
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
+ error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, dfops,
&cur, 0, &tmp_logflags, whichfork);
logflags |= tmp_logflags;
}
@@ -6022,36 +6027,33 @@ xfs_bmap_split_extent(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_bmap_free free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t firstfsb;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_bmap_init(&free_list, &firstfsb);
+ xfs_defer_init(&dfops, &firstfsb);
error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
- &firstfsb, &free_list);
+ &firstfsb, &dfops);
if (error)
goto out;
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out;
return xfs_trans_commit(tp);
out:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 423a34e832bdc..254034f969413 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -32,7 +32,7 @@ extern kmem_zone_t *xfs_bmap_free_item_zone;
*/
struct xfs_bmalloca {
xfs_fsblock_t *firstblock; /* i/o first block allocated */
- struct xfs_bmap_free *flist; /* bmap freelist */
+ struct xfs_defer_ops *dfops; /* bmap freelist */
struct xfs_trans *tp; /* transaction pointer */
struct xfs_inode *ip; /* incore inode pointer */
struct xfs_bmbt_irec prev; /* extent before the new one */
@@ -62,33 +62,13 @@ struct xfs_bmalloca {
* List of extents to be free "later".
* The list is kept sorted on xbf_startblock.
*/
-typedef struct xfs_bmap_free_item
+struct xfs_extent_free_item
{
- xfs_fsblock_t xbfi_startblock;/* starting fs block number */
- xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
- struct xfs_bmap_free_item *xbfi_next; /* link to next entry */
-} xfs_bmap_free_item_t;
-
-/*
- * Header for free extent list.
- *
- * xbf_low is used by the allocator to activate the lowspace algorithm -
- * when free space is running low the extent allocator may choose to
- * allocate an extent from an AG without leaving sufficient space for
- * a btree split when inserting the new extent. In this case the allocator
- * will enable the lowspace algorithm which is supposed to allow further
- * allocations (such as btree splits and newroots) to allocate from
- * sequential AGs. In order to avoid locking AGs out of order the lowspace
- * algorithm will start searching for free space from AG 0. If the correct
- * transaction reservations have been made then this algorithm will eventually
- * find all the space it needs.
- */
-typedef struct xfs_bmap_free
-{
- xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
- int xbf_count; /* count of items on list */
- int xbf_low; /* alloc in low mode */
-} xfs_bmap_free_t;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ struct list_head xefi_list;
+ struct xfs_owner_info xefi_oinfo; /* extent owner */
+};
#define XFS_BMAP_MAX_NMAP 4
@@ -139,12 +119,6 @@ static inline int xfs_bmapi_aflag(int w)
#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
-static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
-{
- ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
- (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
-}
-
/*
* Flags for xfs_bmap_add_extent*.
*/
@@ -191,11 +165,9 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
- struct xfs_bmap_free *flist, struct xfs_mount *mp);
-void xfs_bmap_cancel(struct xfs_bmap_free *flist);
-int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
- struct xfs_inode *ip);
+void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ xfs_fsblock_t bno, xfs_filblks_t len,
+ struct xfs_owner_info *oinfo);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -216,18 +188,18 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_fsblock_t *firstblock, xfs_extlen_t total,
struct xfs_bmbt_irec *mval, int *nmap,
- struct xfs_bmap_free *flist);
+ struct xfs_defer_ops *dfops);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
- struct xfs_bmap_free *flist, int *done);
+ struct xfs_defer_ops *dfops, int *done);
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
- struct xfs_bmap_free *flist, enum shift_direction direction,
+ struct xfs_defer_ops *dfops, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6282f6e708afa..cd85274e810cd 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
@@ -34,6 +35,7 @@
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_rmap.h"
/*
* Determine the extent state.
@@ -406,11 +408,11 @@ xfs_bmbt_dup_cursor(
cur->bc_private.b.ip, cur->bc_private.b.whichfork);
/*
- * Copy the firstblock, flist, and flags values,
+ * Copy the firstblock, dfops, and flags values,
* since init cursor doesn't get them.
*/
new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
- new->bc_private.b.flist = cur->bc_private.b.flist;
+ new->bc_private.b.dfops = cur->bc_private.b.dfops;
new->bc_private.b.flags = cur->bc_private.b.flags;
return new;
@@ -423,7 +425,7 @@ xfs_bmbt_update_cursor(
{
ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
(dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
- ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+ ASSERT(dst->bc_private.b.dfops == src->bc_private.b.dfops);
dst->bc_private.b.allocated += src->bc_private.b.allocated;
dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
@@ -446,6 +448,8 @@ xfs_bmbt_alloc_block(
args.mp = cur->bc_mp;
args.fsbno = cur->bc_private.b.firstblock;
args.firstblock = args.fsbno;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
+ cur->bc_private.b.whichfork);
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
@@ -462,7 +466,7 @@ xfs_bmbt_alloc_block(
* block allocation here and corrupt the filesystem.
*/
args.minleft = args.tp->t_blk_res;
- } else if (cur->bc_private.b.flist->xbf_low) {
+ } else if (cur->bc_private.b.dfops->dop_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -490,7 +494,7 @@ xfs_bmbt_alloc_block(
error = xfs_alloc_vextent(&args);
if (error)
goto error0;
- cur->bc_private.b.flist->xbf_low = 1;
+ cur->bc_private.b.dfops->dop_low = true;
}
if (args.fsbno == NULLFSBLOCK) {
XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
@@ -525,8 +529,10 @@ xfs_bmbt_free_block(
struct xfs_inode *ip = cur->bc_private.b.ip;
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ struct xfs_owner_info oinfo;
- xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork);
+ xfs_bmap_add_free(mp, cur->bc_private.b.dfops, fsbno, 1, &oinfo);
ip->i_d.di_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -600,17 +606,6 @@ xfs_bmbt_init_key_from_rec(
}
STATIC void
-xfs_bmbt_init_rec_from_key(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
-{
- ASSERT(key->bmbt.br_startoff != 0);
-
- xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
- 0, 0, XFS_EXT_NORM);
-}
-
-STATIC void
xfs_bmbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -760,7 +755,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.get_minrecs = xfs_bmbt_get_minrecs,
.get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
- .init_rec_from_key = xfs_bmbt_init_rec_from_key,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
@@ -800,7 +794,7 @@ xfs_bmbt_init_cursor(
cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
cur->bc_private.b.ip = ip;
cur->bc_private.b.firstblock = NULLFSBLOCK;
- cur->bc_private.b.flist = NULL;
+ cur->bc_private.b.dfops = NULL;
cur->bc_private.b.allocated = 0;
cur->bc_private.b.flags = 0;
cur->bc_private.b.whichfork = whichfork;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 1f88e1ce770f3..b5c213a051cde 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
@@ -43,15 +44,14 @@ kmem_zone_t *xfs_btree_cur_zone;
* Btree magic numbers.
*/
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
XFS_FIBT_MAGIC },
- { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+ { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
-
STATIC int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lblock(
struct xfs_btree_cur *cur, /* btree cursor */
@@ -428,6 +428,50 @@ xfs_btree_dup_cursor(
* into a btree block (xfs_btree_*_offset) or return a pointer to the given
* record, key or pointer (xfs_btree_*_addr). Note that all addressing
* inside the btree block is done using indices starting at one, not zero!
+ *
+ * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * overlapping intervals. In such a tree, records are still sorted lowest to
+ * highest and indexed by the smallest key value that refers to the record.
+ * However, nodes are different: each pointer has two associated keys -- one
+ * indexing the lowest key available in the block(s) below (the same behavior
+ * as the key in a regular btree) and another indexing the highest key
+ * available in the block(s) below. Because records are /not/ sorted by the
+ * highest key, all leaf block updates require us to compute the highest key
+ * that matches any record in the leaf and to recursively update the high keys
+ * in the nodes going further up in the tree, if necessary. Nodes look like
+ * this:
+ *
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ *
+ * To perform an interval query on an overlapped tree, perform the usual
+ * depth-first search and use the low and high keys to decide if we can skip
+ * that particular node. If a leaf node is reached, return the records that
+ * intersect the interval. Note that an interval query may return numerous
+ * entries. For a non-overlapped tree, simply search for the record associated
+ * with the lowest key and iterate forward until a non-matching record is
+ * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
+ * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discusses this in
+ * more detail.
+ *
+ * Why do we care about overlapping intervals? Let's say you have a bunch of
+ * reverse mapping records on a reflink filesystem:
+ *
+ * 1: +- file A startblock B offset C length D -----------+
+ * 2: +- file E startblock F offset G length H --------------+
+ * 3: +- file I startblock F offset J length K --+
+ * 4: +- file L... --+
+ *
+ * Now say we want to map block (B+D) into file A at offset (C+D). Ideally,
+ * we'd simply increment the length of record 1. But how do we find the record
+ * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return
+ * record 3 because the keys are ordered first by startblock. An interval
+ * query would return records 1 and 2 because they both overlap (B+D-1), and
+ * from that we can pick out record 1 as the appropriate left neighbor.
+ *
+ * In the non-overlapped case you can do a LE lookup and decrement the cursor
+ * because a record's interval must end before the next record.
*/
/*
@@ -479,6 +523,18 @@ xfs_btree_key_offset(
}
/*
+ * Calculate offset of the n-th high key in a btree block.
+ */
+STATIC size_t
+xfs_btree_high_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
+}
+
+/*
* Calculate offset of the n-th block pointer in a btree block.
*/
STATIC size_t
@@ -519,6 +575,19 @@ xfs_btree_key_addr(
}
/*
+ * Return a pointer to the n-th high key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_high_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_high_key_offset(cur, n));
+}
+
+/*
* Return a pointer to the n-th block pointer in the btree block.
*/
STATIC union xfs_btree_ptr *
@@ -543,12 +612,12 @@ xfs_btree_ptr_addr(
*/
STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
- struct xfs_btree_cur *cur)
+ struct xfs_btree_cur *cur)
{
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
- return (struct xfs_btree_block *)ifp->if_broot;
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+ return (struct xfs_btree_block *)ifp->if_broot;
}
/*
@@ -1144,6 +1213,9 @@ xfs_btree_set_refs(
case XFS_BTNUM_BMAP:
xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
break;
+ case XFS_BTNUM_RMAP:
+ xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
+ break;
default:
ASSERT(0);
}
@@ -1879,32 +1951,214 @@ error0:
return error;
}
+/* Find the high key storage area from a regular key. */
+STATIC union xfs_btree_key *
+xfs_btree_high_key_from_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ return (union xfs_btree_key *)((char *)key +
+ (cur->bc_ops->key_len / 2));
+}
+
+/* Determine the low (and high if overlapped) keys of a leaf block */
+STATIC void
+xfs_btree_get_leaf_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key max_hkey;
+ union xfs_btree_key hkey;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key *high;
+ int n;
+
+ rec = xfs_btree_rec_addr(cur, 1, block);
+ cur->bc_ops->init_key_from_rec(key, rec);
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+
+ cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ rec = xfs_btree_rec_addr(cur, n, block);
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
+ > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
+ }
+}
+
+/* Determine the low (and high if overlapped) keys of a node block */
+STATIC void
+xfs_btree_get_node_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *max_hkey;
+ union xfs_btree_key *high;
+ int n;
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len / 2);
+
+ max_hkey = xfs_btree_high_key_addr(cur, 1, block);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ hkey = xfs_btree_high_key_addr(cur, n, block);
+ if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
+ } else {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len);
+ }
+}
+
+/* Derive the keys for any btree block. */
+STATIC void
+xfs_btree_get_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ if (be16_to_cpu(block->bb_level) == 0)
+ xfs_btree_get_leaf_keys(cur, block, key);
+ else
+ xfs_btree_get_node_keys(cur, block, key);
+}
+
/*
- * Update keys at all levels from here to the root along the cursor's path.
+ * Decide if we need to update the parent keys of a btree block. For
+ * a standard btree this is only necessary if we're updating the first
+ * record/key. For an overlapping btree, we must always update the
+ * keys because the highest key can be in any of the records or keys
+ * in the block.
+ */
+static inline bool
+xfs_btree_needs_key_update(
+ struct xfs_btree_cur *cur,
+ int ptr)
+{
+ return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+}
+
+/*
+ * Update the low and high parent keys of the given level, progressing
+ * towards the root. If force_all is false, stop if the keys for a given
+ * level do not need updating.
*/
STATIC int
-xfs_btree_updkey(
+__xfs_btree_updkeys(
+ struct xfs_btree_cur *cur,
+ int level,
+ struct xfs_btree_block *block,
+ struct xfs_buf *bp0,
+ bool force_all)
+{
+ union xfs_btree_bigkey key; /* keys from current level */
+ union xfs_btree_key *lkey; /* keys from the next level up */
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *nlkey; /* keys from the next level up */
+ union xfs_btree_key *nhkey;
+ struct xfs_buf *bp;
+ int ptr;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+
+ /* Exit if there aren't any parent levels to update. */
+ if (level + 1 >= cur->bc_nlevels)
+ return 0;
+
+ trace_xfs_btree_updkeys(cur, level, bp0);
+
+ lkey = (union xfs_btree_key *)&key;
+ hkey = xfs_btree_high_key_from_key(cur, lkey);
+ xfs_btree_get_keys(cur, block, lkey);
+ for (level++; level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_updkeys(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+#endif
+ ptr = cur->bc_ptrs[level];
+ nlkey = xfs_btree_key_addr(cur, ptr, block);
+ nhkey = xfs_btree_high_key_addr(cur, ptr, block);
+ if (!force_all &&
+ !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
+ cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
+ break;
+ xfs_btree_copy_keys(cur, nlkey, lkey, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ if (level + 1 >= cur->bc_nlevels)
+ break;
+ xfs_btree_get_node_keys(cur, block, lkey);
+ }
+
+ return 0;
+}
+
+/* Update all the keys from some level in cursor back to the root. */
+STATIC int
+xfs_btree_updkeys_force(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_buf *bp;
+ struct xfs_btree_block *block;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ return __xfs_btree_updkeys(cur, level, block, bp, true);
+}
+
+/*
+ * Update the parent keys of the given level, progressing towards the root.
+ */
+STATIC int
+xfs_btree_update_keys(
struct xfs_btree_cur *cur,
- union xfs_btree_key *keyp,
int level)
{
struct xfs_btree_block *block;
struct xfs_buf *bp;
union xfs_btree_key *kp;
+ union xfs_btree_key key;
int ptr;
+ ASSERT(level >= 0);
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+ return __xfs_btree_updkeys(cur, level, block, bp, false);
+
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
- ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
-
/*
* Go up the tree from this level toward the root.
* At each level, update the key value to the value input.
* Stop when we reach a level where the cursor isn't pointing
* at the first entry in the block.
*/
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+ xfs_btree_get_keys(cur, block, &key);
+ for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
#ifdef DEBUG
int error;
#endif
@@ -1918,7 +2172,7 @@ xfs_btree_updkey(
#endif
ptr = cur->bc_ptrs[level];
kp = xfs_btree_key_addr(cur, ptr, block);
- xfs_btree_copy_keys(cur, kp, keyp, 1);
+ xfs_btree_copy_keys(cur, kp, &key, 1);
xfs_btree_log_keys(cur, bp, ptr, ptr);
}
@@ -1970,12 +2224,9 @@ xfs_btree_update(
ptr, LASTREC_UPDATE);
}
- /* Updating first rec in leaf. Pass new key value up to our parent. */
- if (ptr == 1) {
- union xfs_btree_key key;
-
- cur->bc_ops->init_key_from_rec(&key, rec);
- error = xfs_btree_updkey(cur, &key, 1);
+ /* Pass new key value up to our parent. */
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, 0);
if (error)
goto error0;
}
@@ -1998,18 +2249,19 @@ xfs_btree_lshift(
int level,
int *stat) /* success/failure */
{
- union xfs_btree_key key; /* btree key */
struct xfs_buf *lbp; /* left buffer pointer */
struct xfs_btree_block *left; /* left btree block */
int lrecs; /* left record count */
struct xfs_buf *rbp; /* right buffer pointer */
struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
int rrecs; /* right record count */
union xfs_btree_ptr lptr; /* left btree pointer */
union xfs_btree_key *rkp = NULL; /* right btree key */
union xfs_btree_ptr *rpp = NULL; /* right address pointer */
union xfs_btree_rec *rrp = NULL; /* right record pointer */
int error; /* error return value */
+ int i;
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGI(cur, level);
@@ -2139,18 +2391,33 @@ xfs_btree_lshift(
xfs_btree_rec_addr(cur, 2, right),
-1, rrecs);
xfs_btree_log_recs(cur, rbp, 1, rrecs);
+ }
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- cur->bc_ops->init_key_from_rec(&key,
- xfs_btree_rec_addr(cur, 1, right));
- rkp = &key;
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the left.
+ */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ /* Update the parent high keys of the left block, if needed. */
+ error = xfs_btree_update_keys(tcur, level);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
}
- /* Update the parent key values of right. */
- error = xfs_btree_updkey(cur, rkp, level + 1);
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
@@ -2169,6 +2436,11 @@ out0:
error0:
XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
+
+error1:
+ XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
}
/*
@@ -2181,7 +2453,6 @@ xfs_btree_rshift(
int level,
int *stat) /* success/failure */
{
- union xfs_btree_key key; /* btree key */
struct xfs_buf *lbp; /* left buffer pointer */
struct xfs_btree_block *left; /* left btree block */
struct xfs_buf *rbp; /* right buffer pointer */
@@ -2290,12 +2561,6 @@ xfs_btree_rshift(
/* Now put the new data in, and log it. */
xfs_btree_copy_recs(cur, rrp, lrp, 1);
xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
-
- cur->bc_ops->init_key_from_rec(&key, rrp);
- rkp = &key;
-
- ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
- xfs_btree_rec_addr(cur, 2, right)));
}
/*
@@ -2315,13 +2580,21 @@ xfs_btree_rshift(
if (error)
goto error0;
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
error = xfs_btree_increment(tcur, level, &i);
if (error)
goto error1;
- error = xfs_btree_updkey(tcur, rkp, level + 1);
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error1;
+ }
+
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(tcur, level);
if (error)
goto error1;
@@ -2422,6 +2695,11 @@ __xfs_btree_split(
XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+ /* Adjust numrecs for the later get_*_keys() calls. */
+ lrecs -= rrecs;
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
/*
* Copy btree block entries from the left block over to the
* new block, the right. Update the right block and log the
@@ -2447,14 +2725,15 @@ __xfs_btree_split(
}
#endif
+ /* Copy the keys & pointers to the new block. */
xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
xfs_btree_log_keys(cur, rbp, 1, rrecs);
xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
- /* Grab the keys to the entries moved to the right block */
- xfs_btree_copy_keys(cur, key, rkp, 1);
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_node_keys(cur, right, key);
} else {
/* It's a leaf. Move records. */
union xfs_btree_rec *lrp; /* left record pointer */
@@ -2463,27 +2742,23 @@ __xfs_btree_split(
lrp = xfs_btree_rec_addr(cur, src_index, left);
rrp = xfs_btree_rec_addr(cur, 1, right);
+ /* Copy records to the new block. */
xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
xfs_btree_log_recs(cur, rbp, 1, rrecs);
- cur->bc_ops->init_key_from_rec(key,
- xfs_btree_rec_addr(cur, 1, right));
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_leaf_keys(cur, right, key);
}
-
/*
* Find the left block number by looking in the buffer.
- * Adjust numrecs, sibling pointers.
+ * Adjust sibling pointers.
*/
xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
- lrecs -= rrecs;
- xfs_btree_set_numrecs(left, lrecs);
- xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
-
xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
@@ -2499,6 +2774,14 @@ __xfs_btree_split(
xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
}
+
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
/*
* If the cursor is really in the right block, move it there.
* If it's just pointing past the last entry in left, then we'll
@@ -2802,6 +3085,7 @@ xfs_btree_new_root(
bp = lbp;
nptr = 2;
}
+
/* Fill in the new block's btree header and log it. */
xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
@@ -2810,19 +3094,24 @@ xfs_btree_new_root(
/* Fill in the key data in the new root. */
if (xfs_btree_get_level(left) > 0) {
- xfs_btree_copy_keys(cur,
- xfs_btree_key_addr(cur, 1, new),
- xfs_btree_key_addr(cur, 1, left), 1);
- xfs_btree_copy_keys(cur,
- xfs_btree_key_addr(cur, 2, new),
- xfs_btree_key_addr(cur, 1, right), 1);
+ /*
+ * Get the keys for the left block's keys and put them directly
+ * in the parent block. Do the same for the right block.
+ */
+ xfs_btree_get_node_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_node_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
} else {
- cur->bc_ops->init_key_from_rec(
- xfs_btree_key_addr(cur, 1, new),
- xfs_btree_rec_addr(cur, 1, left));
- cur->bc_ops->init_key_from_rec(
- xfs_btree_key_addr(cur, 2, new),
- xfs_btree_rec_addr(cur, 1, right));
+ /*
+ * Get the keys for the left block's records and put them
+ * directly in the parent block. Do the same for the right
+ * block.
+ */
+ xfs_btree_get_leaf_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_leaf_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
}
xfs_btree_log_keys(cur, nbp, 1, 2);
@@ -2858,10 +3147,9 @@ xfs_btree_make_block_unfull(
int *index, /* new tree index */
union xfs_btree_ptr *nptr, /* new btree ptr */
struct xfs_btree_cur **ncur, /* new btree cursor */
- union xfs_btree_rec *nrec, /* new record */
+ union xfs_btree_key *key, /* key of new block */
int *stat)
{
- union xfs_btree_key key; /* new btree key value */
int error = 0;
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
@@ -2871,6 +3159,7 @@ xfs_btree_make_block_unfull(
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+ *stat = 1;
} else {
/* A root block that needs replacing */
int logflags = 0;
@@ -2906,13 +3195,12 @@ xfs_btree_make_block_unfull(
* If this works we have to re-set our variables because we
* could be in a different block now.
*/
- error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+ error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
if (error || *stat == 0)
return error;
*index = cur->bc_ptrs[level];
- cur->bc_ops->init_rec_from_key(&key, nrec);
return 0;
}
@@ -2925,16 +3213,17 @@ xfs_btree_insrec(
struct xfs_btree_cur *cur, /* btree cursor */
int level, /* level to insert record at */
union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
- union xfs_btree_rec *recp, /* i/o: record data inserted */
+ union xfs_btree_rec *rec, /* record to insert */
+ union xfs_btree_key *key, /* i/o: block key for ptrp */
struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
int *stat) /* success/failure */
{
struct xfs_btree_block *block; /* btree block */
struct xfs_buf *bp; /* buffer for block */
- union xfs_btree_key key; /* btree key */
union xfs_btree_ptr nptr; /* new block ptr */
struct xfs_btree_cur *ncur; /* new btree cursor */
- union xfs_btree_rec nrec; /* new record count */
+ union xfs_btree_bigkey nkey; /* new block key */
+ union xfs_btree_key *lkey;
int optr; /* old key/record index */
int ptr; /* key/record index */
int numrecs;/* number of records */
@@ -2942,11 +3231,13 @@ xfs_btree_insrec(
#ifdef DEBUG
int i;
#endif
+ xfs_daddr_t old_bn;
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+ XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
ncur = NULL;
+ lkey = (union xfs_btree_key *)&nkey;
/*
* If we have an external root pointer, and we've made it to the
@@ -2969,15 +3260,13 @@ xfs_btree_insrec(
return 0;
}
- /* Make a key out of the record data to be inserted, and save it. */
- cur->bc_ops->init_key_from_rec(&key, recp);
-
optr = ptr;
XFS_BTREE_STATS_INC(cur, insrec);
/* Get pointers to the btree buffer and block. */
block = xfs_btree_get_block(cur, level, &bp);
+ old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
numrecs = xfs_btree_get_numrecs(block);
#ifdef DEBUG
@@ -2988,10 +3277,10 @@ xfs_btree_insrec(
/* Check that the new entry is being inserted in the right place. */
if (ptr <= numrecs) {
if (level == 0) {
- ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+ ASSERT(cur->bc_ops->recs_inorder(cur, rec,
xfs_btree_rec_addr(cur, ptr, block)));
} else {
- ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+ ASSERT(cur->bc_ops->keys_inorder(cur, key,
xfs_btree_key_addr(cur, ptr, block)));
}
}
@@ -3004,7 +3293,7 @@ xfs_btree_insrec(
xfs_btree_set_ptr_null(cur, &nptr);
if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
error = xfs_btree_make_block_unfull(cur, level, numrecs,
- &optr, &ptr, &nptr, &ncur, &nrec, stat);
+ &optr, &ptr, &nptr, &ncur, lkey, stat);
if (error || *stat == 0)
goto error0;
}
@@ -3054,7 +3343,7 @@ xfs_btree_insrec(
#endif
/* Now put the new data in, bump numrecs and log it. */
- xfs_btree_copy_keys(cur, kp, &key, 1);
+ xfs_btree_copy_keys(cur, kp, key, 1);
xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
numrecs++;
xfs_btree_set_numrecs(block, numrecs);
@@ -3075,7 +3364,7 @@ xfs_btree_insrec(
xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
/* Now put the new data in, bump numrecs and log it. */
- xfs_btree_copy_recs(cur, rp, recp, 1);
+ xfs_btree_copy_recs(cur, rp, rec, 1);
xfs_btree_set_numrecs(block, ++numrecs);
xfs_btree_log_recs(cur, bp, ptr, numrecs);
#ifdef DEBUG
@@ -3089,9 +3378,18 @@ xfs_btree_insrec(
/* Log the new number of records in the btree header. */
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
- /* If we inserted at the start of a block, update the parents' keys. */
- if (optr == 1) {
- error = xfs_btree_updkey(cur, &key, level + 1);
+ /*
+ * If we just inserted into a new tree block, we have to
+ * recalculate nkey here because nkey is out of date.
+ *
+ * Otherwise we're just updating an existing block (having shoved
+ * some records into the new tree block), so use the regular key
+ * update mechanism.
+ */
+ if (bp && bp->b_bn != old_bn) {
+ xfs_btree_get_keys(cur, block, lkey);
+ } else if (xfs_btree_needs_key_update(cur, optr)) {
+ error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
}
@@ -3101,7 +3399,7 @@ xfs_btree_insrec(
* we are at the far right edge of the tree, update it.
*/
if (xfs_btree_is_lastrec(cur, block, level)) {
- cur->bc_ops->update_lastrec(cur, block, recp,
+ cur->bc_ops->update_lastrec(cur, block, rec,
ptr, LASTREC_INSREC);
}
@@ -3111,7 +3409,7 @@ xfs_btree_insrec(
*/
*ptrp = nptr;
if (!xfs_btree_ptr_is_null(cur, &nptr)) {
- *recp = nrec;
+ xfs_btree_copy_keys(cur, key, lkey, 1);
*curp = ncur;
}
@@ -3142,14 +3440,20 @@ xfs_btree_insert(
union xfs_btree_ptr nptr; /* new block number (split result) */
struct xfs_btree_cur *ncur; /* new cursor (split result) */
struct xfs_btree_cur *pcur; /* previous level's cursor */
+ union xfs_btree_bigkey bkey; /* key of block to insert */
+ union xfs_btree_key *key;
union xfs_btree_rec rec; /* record to insert */
level = 0;
ncur = NULL;
pcur = cur;
+ key = (union xfs_btree_key *)&bkey;
xfs_btree_set_ptr_null(cur, &nptr);
+
+ /* Make a key out of the record data to be inserted, and save it. */
cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(key, &rec);
/*
* Loop going up the tree, starting at the leaf level.
@@ -3161,7 +3465,8 @@ xfs_btree_insert(
* Insert nrec/nptr into this level of the tree.
* Note if we fail, nptr will be null.
*/
- error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+ error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
+ &ncur, &i);
if (error) {
if (pcur != cur)
xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
@@ -3385,8 +3690,6 @@ xfs_btree_delrec(
struct xfs_buf *bp; /* buffer for block */
int error; /* error return value */
int i; /* loop counter */
- union xfs_btree_key key; /* storage for keyp */
- union xfs_btree_key *keyp = &key; /* passed to the next level */
union xfs_btree_ptr lptr; /* left sibling block ptr */
struct xfs_buf *lbp; /* left buffer pointer */
struct xfs_btree_block *left; /* left btree block */
@@ -3457,13 +3760,6 @@ xfs_btree_delrec(
xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
}
-
- /*
- * If it's the first record in the block, we'll need to pass a
- * key up to the next level (updkey).
- */
- if (ptr == 1)
- keyp = xfs_btree_key_addr(cur, 1, block);
} else {
/* It's a leaf. operate on records */
if (ptr < numrecs) {
@@ -3472,16 +3768,6 @@ xfs_btree_delrec(
-1, numrecs - ptr);
xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
}
-
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- if (ptr == 1) {
- cur->bc_ops->init_key_from_rec(&key,
- xfs_btree_rec_addr(cur, 1, block));
- keyp = &key;
- }
}
/*
@@ -3548,8 +3834,8 @@ xfs_btree_delrec(
* If we deleted the leftmost entry in the block, update the
* key values above us in the tree.
*/
- if (ptr == 1) {
- error = xfs_btree_updkey(cur, keyp, level + 1);
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
}
@@ -3878,6 +4164,16 @@ xfs_btree_delrec(
if (level > 0)
cur->bc_ptrs[level]--;
+ /*
+ * We combined blocks, so we have to update the parent keys if the
+ * btree supports overlapped intervals. However, bc_ptrs[level + 1]
+ * points to the old block so that the caller knows which record to
+ * delete. Therefore, the caller must be savvy enough to call updkeys
+ * for us if we return stat == 2. The other exit points from this
+ * function don't require deletions further up the tree, so they can
+ * call updkeys directly.
+ */
+
XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
/* Return value means the next level up has something to do. */
*stat = 2;
@@ -3903,6 +4199,7 @@ xfs_btree_delete(
int error; /* error return value */
int level;
int i;
+ bool joined = false;
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
@@ -3916,6 +4213,18 @@ xfs_btree_delete(
error = xfs_btree_delrec(cur, level, &i);
if (error)
goto error0;
+ if (i == 2)
+ joined = true;
+ }
+
+ /*
+ * If we combined blocks as part of deleting the record, delrec won't
+ * have updated the parent high keys so we have to do that here.
+ */
+ if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+ error = xfs_btree_updkeys_force(cur, 0);
+ if (error)
+ goto error0;
}
if (i == 0) {
@@ -3978,6 +4287,81 @@ xfs_btree_get_rec(
return 0;
}
+/* Visit a block in a btree. */
+STATIC int
+xfs_btree_visit_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_btree_visit_blocks_fn fn,
+ void *data)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+ int error;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* process the block */
+ error = fn(cur, level, data);
+ if (error)
+ return error;
+
+ /* now read rh sibling block for next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return -ENOENT;
+
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+
+/* Visit every block in a btree. */
+int
+xfs_btree_visit_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn,
+ void *data)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ lptr = *ptr;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_visit_block(cur, level, fn, data);
+ } while (!error);
+
+ if (error != -ENOENT)
+ return error;
+ }
+
+ return 0;
+}
+
/*
* Change the owner of a btree.
*
@@ -4002,26 +4386,27 @@ xfs_btree_get_rec(
* just queue the modified buffer as delayed write buffer so the transaction
* recovery completion writes the changes to disk.
*/
+struct xfs_btree_block_change_owner_info {
+ __uint64_t new_owner;
+ struct list_head *buffer_list;
+};
+
static int
xfs_btree_block_change_owner(
struct xfs_btree_cur *cur,
int level,
- __uint64_t new_owner,
- struct list_head *buffer_list)
+ void *data)
{
+ struct xfs_btree_block_change_owner_info *bbcoi = data;
struct xfs_btree_block *block;
struct xfs_buf *bp;
- union xfs_btree_ptr rptr;
-
- /* do right sibling readahead */
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+ block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
else
- block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+ block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
/*
* If the block is a root block hosted in an inode, we might not have a
@@ -4035,19 +4420,14 @@ xfs_btree_block_change_owner(
xfs_trans_ordered_buf(cur->bc_tp, bp);
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
} else {
- xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
} else {
ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
ASSERT(level == cur->bc_nlevels - 1);
}
- /* now read rh sibling block for next iteration */
- xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
- if (xfs_btree_ptr_is_null(cur, &rptr))
- return -ENOENT;
-
- return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+ return 0;
}
int
@@ -4056,43 +4436,13 @@ xfs_btree_change_owner(
__uint64_t new_owner,
struct list_head *buffer_list)
{
- union xfs_btree_ptr lptr;
- int level;
- struct xfs_btree_block *block = NULL;
- int error = 0;
+ struct xfs_btree_block_change_owner_info bbcoi;
- cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+ bbcoi.new_owner = new_owner;
+ bbcoi.buffer_list = buffer_list;
- /* for each level */
- for (level = cur->bc_nlevels - 1; level >= 0; level--) {
- /* grab the left hand block */
- error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
- if (error)
- return error;
-
- /* readahead the left most block for the next level down */
- if (level > 0) {
- union xfs_btree_ptr *ptr;
-
- ptr = xfs_btree_ptr_addr(cur, 1, block);
- xfs_btree_readahead_ptr(cur, ptr, 1);
-
- /* save for the next iteration of the loop */
- lptr = *ptr;
- }
-
- /* for each buffer in the level */
- do {
- error = xfs_btree_block_change_owner(cur, level,
- new_owner,
- buffer_list);
- } while (!error);
-
- if (error != -ENOENT)
- return error;
- }
-
- return 0;
+ return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
+ &bbcoi);
}
/**
@@ -4152,3 +4502,286 @@ xfs_btree_sblock_verify(
return true;
}
+
+/*
+ * Work out how many levels a short-format btree needs in order to hold
+ * @len records.
+ *
+ * limits[0] is the divisor applied to the record count for the leaf level
+ * and limits[1] the divisor applied for every level above it.  @mp is not
+ * referenced by this function.
+ */
+uint
+xfs_btree_compute_maxlevels(
+	struct xfs_mount	*mp,
+	uint			*limits,
+	unsigned long		len)
+{
+	unsigned long		nblocks;
+	uint			height = 1;
+
+	/* Blocks needed at the leaf level, rounded up. */
+	nblocks = (len + limits[0] - 1) / limits[0];
+
+	/* Add one level per round of indexing until a single block remains. */
+	while (nblocks > 1) {
+		nblocks = (nblocks + limits[1] - 1) / limits[1];
+		height++;
+	}
+	return height;
+}
+
+/*
+ * Query a regular btree for all records overlapping a given interval.
+ * Start with a LE lookup of the key of low_rec and return all records
+ * until we find a record with a key greater than the key of high_rec.
+ *
+ * @cur:	cursor; bc_rec must hold the low record low_key was made from
+ * @low_key:	low end of the query interval
+ * @high_key:	high end of the query interval
+ * @fn:		called for every record that overlaps the interval
+ * @priv:	passed through to @fn untouched
+ *
+ * Returns the first negative error or XFS_BTREE_QUERY_RANGE_ABORT produced
+ * by @fn, an error from the btree lookup/increment/get_rec helpers, or
+ * whatever the last helper or @fn call returned once the scan runs out of
+ * records or passes high_key.
+ */
+STATIC int
+xfs_btree_simple_query_range(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_key		*low_key,
+	union xfs_btree_key		*high_key,
+	xfs_btree_query_range_fn	fn,
+	void				*priv)
+{
+	union xfs_btree_rec		*recp;
+	union xfs_btree_key		rec_key;
+	__int64_t			diff;
+	int				stat;
+	bool				firstrec = true;
+	int				error;
+
+	ASSERT(cur->bc_ops->init_high_key_from_rec);
+	ASSERT(cur->bc_ops->diff_two_keys);
+
+	/*
+	 * Find the leftmost record.  The btree cursor must be set
+	 * to the low record used to generate low_key.
+	 */
+	stat = 0;
+	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+	if (error)
+		goto out;
+
+	/*
+	 * Nothing at or below low_key?  Records that start after low_key can
+	 * still fall inside the interval, so see if there's anything to the
+	 * right before giving up.
+	 */
+	if (!stat) {
+		error = xfs_btree_increment(cur, 0, &stat);
+		if (error)
+			goto out;
+	}
+
+	while (stat) {
+		/* Find the record. */
+		error = xfs_btree_get_rec(cur, &recp, &stat);
+		if (error || !stat)
+			break;
+
+		/*
+		 * Skip if high_key(rec) < low_key.  Only the record the LE
+		 * lookup landed on can end before the interval starts.
+		 */
+		if (firstrec) {
+			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
+			firstrec = false;
+			diff = cur->bc_ops->diff_two_keys(cur, low_key,
+					&rec_key);
+			if (diff > 0)
+				goto advloop;
+		}
+
+		/*
+		 * Stop if high_key < low_key(rec).  This must use the record's
+		 * low key; comparing its high key would drop a record that
+		 * starts inside the interval but extends past high_key.
+		 */
+		cur->bc_ops->init_key_from_rec(&rec_key, recp);
+		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
+		if (diff > 0)
+			break;
+
+		/* Callback */
+		error = fn(cur, recp, priv);
+		if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+			break;
+
+advloop:
+		/* Move on to the next record. */
+		error = xfs_btree_increment(cur, 0, &stat);
+		if (error)
+			break;
+	}
+
+out:
+	return error;
+}
+
+/*
+ * Query an overlapped interval btree for all records overlapping a given
+ * interval. This function roughly follows the algorithm given in
+ * "Interval Trees" of _Introduction to Algorithms_, which is section
+ * 14.3 in the 2nd and 3rd editions.
+ *
+ * First, generate keys for the low and high records passed in.
+ *
+ * For any leaf node, generate the high and low keys for the record.
+ * If the record keys overlap with the query low/high keys, pass the
+ * record to the function iterator.
+ *
+ * For any internal node, compare the low and high keys of each
+ * pointer against the query low/high keys. If there's an overlap,
+ * follow the pointer.
+ *
+ * As an optimization, we stop scanning a block when we find a low key
+ * that is greater than the query's high key.
+ *
+ * The walk is iterative: cur->bc_ptrs[level] holds the 1-based index of the
+ * entry being examined at each level, and "level" moves down when a pointer
+ * is followed and up when a block is exhausted. A negative error or
+ * XFS_BTREE_QUERY_RANGE_ABORT from fn, or an error from a block lookup,
+ * ends the walk and is returned.
+ */
+STATIC int
+xfs_btree_overlapped_query_range(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *low_key,
+ union xfs_btree_key *high_key,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_key rec_key;
+ union xfs_btree_key rec_hkey;
+ union xfs_btree_key *lkp;
+ union xfs_btree_key *hkp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ __int64_t ldiff;
+ __int64_t hdiff;
+ int level;
+ struct xfs_buf *bp;
+ int i;
+ int error;
+
+ /* Load the root of the btree. */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
+ if (error)
+ return error;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ /* Start scanning the root at its first entry. */
+ cur->bc_ptrs[level] = 1;
+
+ /* level only reaches bc_nlevels after popping up past the root. */
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+pop_up:
+ /* Resume the parent (if any) at its next entry. */
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ if (level == 0) {
+ /* Handle a leaf node. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+
+ cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
+ ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
+ low_key);
+
+ cur->bc_ops->init_key_from_rec(&rec_key, recp);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
+ &rec_key);
+
+ /*
+ * If (record's high key >= query's low key) and
+ * (query's high key >= record's low key), then
+ * this record overlaps the query range; callback.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ error = fn(cur, recp, priv);
+ if (error < 0 ||
+ error == XFS_BTREE_QUERY_RANGE_ABORT)
+ break;
+ } else if (hdiff < 0) {
+ /* Record is larger than high key; pop. */
+ goto pop_up;
+ }
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+
+ /* Handle an internal node. */
+ lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+
+ ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
+
+ /*
+ * If (pointer's high key >= query's low key) and
+ * (query's high key >= pointer's low key), then
+ * this record overlaps the query range; follow pointer.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ level--;
+ error = xfs_btree_lookup_get_block(cur, level, pp,
+ &block);
+ if (error)
+ goto out;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ /* Start the child block at its first entry. */
+ cur->bc_ptrs[level] = 1;
+ continue;
+ } else if (hdiff < 0) {
+ /* The low key is larger than the upper range; pop. */
+ goto pop_up;
+ }
+ /* No overlap yet; try the next pointer in this node. */
+ cur->bc_ptrs[level]++;
+ }
+
+out:
+ /*
+ * If we don't end this function with the cursor pointing at a record
+ * block, a subsequent non-error cursor deletion will not release
+ * node-level buffers, causing a buffer leak. This is quite possible
+ * with a zero-results range query, so release the buffers if we
+ * failed to return any results.
+ */
+ if (cur->bc_bufs[0] == NULL) {
+ for (i = 0; i < cur->bc_nlevels; i++) {
+ if (cur->bc_bufs[i]) {
+ xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+ cur->bc_bufs[i] = NULL;
+ cur->bc_ptrs[i] = 0;
+ cur->bc_ra[i] = 0;
+ }
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Walk every record in a btree that overlaps the key interval described by
+ * @low_rec and @high_rec.  @fn is invoked once per matching record and
+ * should return XFS_BTREE_QUERY_RANGE_CONTINUE, XFS_BTREE_QUERY_RANGE_ABORT
+ * or a negative error code; @priv is handed to it unchanged.
+ *
+ * Returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a negative error code;
+ * -EINVAL is returned when the low key sorts after the high key.
+ */
+int
+xfs_btree_query_range(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_irec		*low_rec,
+	union xfs_btree_irec		*high_rec,
+	xfs_btree_query_range_fn	fn,
+	void				*priv)
+{
+	union xfs_btree_key		hi;
+	union xfs_btree_key		lo;
+	union xfs_btree_rec		tmp;
+
+	/*
+	 * Turn each in-core endpoint into an on-disk key by way of the
+	 * cursor.  The high end goes first so that bc_rec is left holding
+	 * the low record once both keys have been built.
+	 */
+	cur->bc_rec = *high_rec;
+	cur->bc_ops->init_rec_from_cur(cur, &tmp);
+	cur->bc_ops->init_key_from_rec(&hi, &tmp);
+
+	cur->bc_rec = *low_rec;
+	cur->bc_ops->init_rec_from_cur(cur, &tmp);
+	cur->bc_ops->init_key_from_rec(&lo, &tmp);
+
+	/* Enforce low key <= high key. */
+	if (cur->bc_ops->diff_two_keys(cur, &lo, &hi) > 0)
+		return -EINVAL;
+
+	/* Overlapping-interval btrees need the interval-tree style walk. */
+	if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+		return xfs_btree_overlapped_query_range(cur, &lo, &hi,
+				fn, priv);
+
+	return xfs_btree_simple_query_range(cur, &lo, &hi, fn, priv);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 2e874be702093..04d0865e5e6dc 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -19,7 +19,7 @@
#define __XFS_BTREE_H__
struct xfs_buf;
-struct xfs_bmap_free;
+struct xfs_defer_ops;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
@@ -38,17 +38,37 @@ union xfs_btree_ptr {
};
union xfs_btree_key {
- xfs_bmbt_key_t bmbt;
- xfs_bmdr_key_t bmbr; /* bmbt root block */
- xfs_alloc_key_t alloc;
- xfs_inobt_key_t inobt;
+ struct xfs_bmbt_key bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ struct xfs_inobt_key inobt;
+ struct xfs_rmap_key rmap;
+};
+
+/*
+ * In-core key that holds both low and high keys for overlapped btrees.
+ * The two keys are packed next to each other on disk, so do the same
+ * in memory. Preserve the existing xfs_btree_key as a single key to
+ * avoid the mental model breakage that would happen if we passed a
+ * bigkey into a function that operates on a single key.
+ */
+union xfs_btree_bigkey {
+ struct xfs_bmbt_key bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ struct xfs_inobt_key inobt;
+ /* rmap is the only member here that carries a second (high) key */
+ struct {
+ struct xfs_rmap_key rmap; /* low key */
+ struct xfs_rmap_key rmap_hi; /* high key, directly after the low key */
+ };
+};
union xfs_btree_rec {
- xfs_bmbt_rec_t bmbt;
- xfs_bmdr_rec_t bmbr; /* bmbt root block */
- xfs_alloc_rec_t alloc;
- xfs_inobt_rec_t inobt;
+ struct xfs_bmbt_rec bmbt;
+ xfs_bmdr_rec_t bmbr; /* bmbt root block */
+ struct xfs_alloc_rec alloc;
+ struct xfs_inobt_rec inobt;
+ struct xfs_rmap_rec rmap;
};
/*
@@ -63,6 +83,7 @@ union xfs_btree_rec {
#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
+#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
/*
* For logging record fields.
@@ -95,6 +116,7 @@ do { \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
+ case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -115,11 +137,13 @@ do { \
__XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
case XFS_BTNUM_FINO: \
__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+ case XFS_BTNUM_RMAP: \
+ __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
-#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
+#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
struct xfs_btree_ops {
/* size of the key and record structures */
@@ -158,17 +182,25 @@ struct xfs_btree_ops {
/* init values of btree structures */
void (*init_key_from_rec)(union xfs_btree_key *key,
union xfs_btree_rec *rec);
- void (*init_rec_from_key)(union xfs_btree_key *key,
- union xfs_btree_rec *rec);
void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec);
void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
+ void (*init_high_key_from_rec)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
/* difference between key value and cursor value */
__int64_t (*key_diff)(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
+ /*
+ * Compare two keys (key1 - key2) -- positive if key1 > key2,
+ * negative if key1 < key2, and zero if equal.
+ */
+ __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key1,
+ union xfs_btree_key *key2);
+
const struct xfs_buf_ops *buf_ops;
#if defined(DEBUG) || defined(XFS_WARN)
@@ -192,6 +224,13 @@ struct xfs_btree_ops {
#define LASTREC_DELREC 2
+/*
+ * In-core record, one member per record type. This is the type of the
+ * cursor's bc_rec and of the low/high records given to
+ * xfs_btree_query_range().
+ */
+union xfs_btree_irec {
+ struct xfs_alloc_rec_incore a;
+ struct xfs_bmbt_irec b;
+ struct xfs_inobt_rec_incore i;
+ struct xfs_rmap_irec r;
+};
+
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
@@ -202,11 +241,7 @@ typedef struct xfs_btree_cur
struct xfs_mount *bc_mp; /* file system mount struct */
const struct xfs_btree_ops *bc_ops;
uint bc_flags; /* btree features - below */
- union {
- xfs_alloc_rec_incore_t a;
- xfs_bmbt_irec_t b;
- xfs_inobt_rec_incore_t i;
- } bc_rec; /* current insert/search record value */
+ union xfs_btree_irec bc_rec; /* current insert/search record value */
struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
__uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
@@ -218,11 +253,12 @@ typedef struct xfs_btree_cur
union {
struct { /* needed for BNO, CNT, INO */
struct xfs_buf *agbp; /* agf/agi buffer pointer */
+ struct xfs_defer_ops *dfops; /* deferred updates */
xfs_agnumber_t agno; /* ag number */
} a;
struct { /* needed for BMAP */
struct xfs_inode *ip; /* pointer to our inode */
- struct xfs_bmap_free *flist; /* list to free after */
+ struct xfs_defer_ops *dfops; /* deferred updates */
xfs_fsblock_t firstblock; /* 1st blk allocated */
int allocated; /* count of alloced */
short forksize; /* fork's inode space */
@@ -238,6 +274,7 @@ typedef struct xfs_btree_cur
#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
+#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
#define XFS_BTREE_NOERROR 0
@@ -474,5 +511,22 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+/* Number of btree levels needed to hold len records; see xfs_btree.c. */
+uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
+ unsigned long len);
+
+/* return codes */
+#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
+#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */
+/*
+ * Per-record callback for xfs_btree_query_range(). Returns one of the
+ * codes above or a negative error code; priv is the caller's pointer.
+ */
+typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec, void *priv);
+
+/* Call fn for every record overlapping the interval [low_rec, high_rec]. */
+int xfs_btree_query_range(struct xfs_btree_cur *cur,
+ union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
+ xfs_btree_query_range_fn fn, void *priv);
+
+/* Callback type taken by xfs_btree_visit_blocks(); data is the caller's. */
+typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
+ void *data);
+int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn, void *data);
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 097bf7717d805..f2dc1a950c85c 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -356,7 +356,6 @@ xfs_da3_split(
struct xfs_da_state_blk *newblk;
struct xfs_da_state_blk *addblk;
struct xfs_da_intnode *node;
- struct xfs_buf *bp;
int max;
int action = 0;
int error;
@@ -397,7 +396,9 @@ xfs_da3_split(
break;
}
/*
- * Entry wouldn't fit, split the leaf again.
+ * Entry wouldn't fit, split the leaf again. The new
+ * extrablk will be consumed by xfs_da3_node_split if
+ * the node is split.
*/
state->extravalid = 1;
if (state->inleaf) {
@@ -446,6 +447,14 @@ xfs_da3_split(
return 0;
/*
+ * xfs_da3_node_split() should have consumed any extra blocks we added
+ * during a double leaf split in the attr fork. This is guaranteed as
+ * we can't be here if the attr fork only has a single leaf block.
+ */
+ ASSERT(state->extravalid == 0 ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
* Split the root node.
*/
ASSERT(state->path.active == 0);
@@ -457,43 +466,33 @@ xfs_da3_split(
}
/*
- * Update pointers to the node which used to be block 0 and
- * just got bumped because of the addition of a new root node.
- * There might be three blocks involved if a double split occurred,
- * and the original block 0 could be at any position in the list.
+ * Update pointers to the node which used to be block 0 and just got
+ * bumped because of the addition of a new root node. Note that the
+ * original block 0 could be at any position in the list of blocks in
+ * the tree.
*
- * Note: the magic numbers and sibling pointers are in the same
- * physical place for both v2 and v3 headers (by design). Hence it
- * doesn't matter which version of the xfs_da_intnode structure we use
- * here as the result will be the same using either structure.
+ * Note: the magic numbers and sibling pointers are in the same physical
+ * place for both v2 and v3 headers (by design). Hence it doesn't matter
+ * which version of the xfs_da_intnode structure we use here as the
+ * result will be the same using either structure.
*/
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
- if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
- bp = addblk->bp;
- } else {
- ASSERT(state->extravalid);
- bp = state->extrablk.bp;
- }
- node = bp->b_addr;
+ ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+ node = addblk->bp->b_addr;
node->hdr.info.back = cpu_to_be32(oldblk->blkno);
- xfs_trans_log_buf(state->args->trans, bp,
- XFS_DA_LOGRANGE(node, &node->hdr.info,
- sizeof(node->hdr.info)));
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
}
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
- if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
- bp = addblk->bp;
- } else {
- ASSERT(state->extravalid);
- bp = state->extrablk.bp;
- }
- node = bp->b_addr;
+ ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+ node = addblk->bp->b_addr;
node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
- xfs_trans_log_buf(state->args->trans, bp,
- XFS_DA_LOGRANGE(node, &node->hdr.info,
- sizeof(node->hdr.info)));
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
}
addblk->bp = NULL;
return 0;
@@ -2030,7 +2029,7 @@ xfs_da_grow_inode_int(
error = xfs_bmapi_write(tp, dp, *bno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->firstblock, args->total, &map, &nmap,
- args->flist);
+ args->dfops);
if (error)
return error;
@@ -2053,7 +2052,7 @@ xfs_da_grow_inode_int(
error = xfs_bmapi_write(tp, dp, b, c,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
args->firstblock, args->total,
- &mapp[mapi], &nmap, args->flist);
+ &mapp[mapi], &nmap, args->dfops);
if (error)
goto out_free_map;
if (nmap < 1)
@@ -2363,7 +2362,7 @@ xfs_da_shrink_inode(
*/
error = xfs_bunmapi(tp, dp, dead_blkno, count,
xfs_bmapi_aflag(w), 0, args->firstblock,
- args->flist, &done);
+ args->dfops, &done);
if (error == -ENOSPC) {
if (w != XFS_DATA_FORK)
break;
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 6e153e399a775..98c75cbe6ac2e 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -19,7 +19,7 @@
#ifndef __XFS_DA_BTREE_H__
#define __XFS_DA_BTREE_H__
-struct xfs_bmap_free;
+struct xfs_defer_ops;
struct xfs_inode;
struct xfs_trans;
struct zone;
@@ -70,7 +70,7 @@ typedef struct xfs_da_args {
xfs_ino_t inumber; /* input/output inode number */
struct xfs_inode *dp; /* directory inode to manipulate */
xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */
- struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */
+ struct xfs_defer_ops *dfops; /* ptr to freelist for bmap_finish */
struct xfs_trans *trans; /* current trans (changes over time) */
xfs_extlen_t total; /* total blocks needed, for 1st bmap */
int whichfork; /* data or attribute fork */
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 9d624a6229468..f1e8d4dbb6001 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
count += len; /* name */
- count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
- sizeof(xfs_dir2_ino4_t); /* ino # */
+ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
return count;
}
@@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
static xfs_ino_t
xfs_dir2_sf_get_ino(
struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *from)
+ __uint8_t *from)
{
if (hdr->i8count)
- return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+ return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
else
- return get_unaligned_be32(&from->i4.i);
+ return get_unaligned_be32(from);
}
static void
xfs_dir2_sf_put_ino(
struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *to,
+ __uint8_t *to,
xfs_ino_t ino)
{
ASSERT((ino & 0xff00000000000000ULL) == 0);
if (hdr->i8count)
- put_unaligned_be64(ino, &to->i8.i);
+ put_unaligned_be64(ino, to);
else
- put_unaligned_be32(ino, &to->i4.i);
+ put_unaligned_be32(ino, to);
}
static xfs_ino_t
xfs_dir2_sf_get_parent_ino(
struct xfs_dir2_sf_hdr *hdr)
{
- return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+ return xfs_dir2_sf_get_ino(hdr, hdr->parent);
}
static void
@@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
struct xfs_dir2_sf_hdr *hdr,
xfs_ino_t ino)
{
- xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+ xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
}
/*
@@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
struct xfs_dir2_sf_hdr *hdr,
struct xfs_dir2_sf_entry *sfep)
{
- return xfs_dir2_sf_get_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+ return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
}
static void
@@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
struct xfs_dir2_sf_entry *sfep,
xfs_ino_t ino)
{
- xfs_dir2_sf_put_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+ xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
}
static xfs_ino_t
@@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
struct xfs_dir2_sf_hdr *hdr,
struct xfs_dir2_sf_entry *sfep)
{
- return xfs_dir2_sf_get_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+ return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
}
static void
@@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
struct xfs_dir2_sf_entry *sfep,
xfs_ino_t ino)
{
- xfs_dir2_sf_put_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+ xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
}
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 8d4d8bce41bf7..9a492a9e19bd0 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t;
typedef uint xfs_dir2_data_aoff_t; /* argument form */
/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
-/*
* Offset in data space of a data entry.
*/
typedef __uint32_t xfs_dir2_dataptr_t;
@@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t;
*/
typedef __uint32_t xfs_dir2_db_t;
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+#define XFS_INO32_SIZE 4
+#define XFS_INO64_SIZE 8
+#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
-typedef union {
- xfs_dir2_ino8_t i8;
- xfs_dir2_ino4_t i4;
-} xfs_dir2_inou_t;
#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
/*
@@ -246,39 +228,38 @@ typedef union {
typedef struct xfs_dir2_sf_hdr {
__uint8_t count; /* count of entries */
__uint8_t i8count; /* count of 8-byte inode #s */
- xfs_dir2_inou_t parent; /* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
+ __uint8_t parent[8]; /* parent dir inode number */
+} __packed xfs_dir2_sf_hdr_t;
typedef struct xfs_dir2_sf_entry {
__u8 namelen; /* actual name length */
- xfs_dir2_sf_off_t offset; /* saved offset */
+ __u8 offset[2]; /* saved offset */
__u8 name[]; /* name, variable size */
/*
* A single byte containing the file type field follows the inode
* number for version 3 directory entries.
*
- * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
- * variable offset after the name.
+ * A 64-bit or 32-bit inode number follows here, at a variable offset
+ * after the name.
*/
-} __arch_pack xfs_dir2_sf_entry_t;
+} xfs_dir2_sf_entry_t;
static inline int xfs_dir2_sf_hdr_size(int i8count)
{
return sizeof(struct xfs_dir2_sf_hdr) -
- (i8count == 0) *
- (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+ (i8count == 0) * XFS_INO64_DIFF;
}
static inline xfs_dir2_data_aoff_t
xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
{
- return get_unaligned_be16(&sfep->offset.i);
+ return get_unaligned_be16(sfep->offset);
}
static inline void
xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
{
- put_unaligned_be16(off, &sfep->offset.i);
+ put_unaligned_be16(off, sfep->offset);
}
static inline struct xfs_dir2_sf_entry *
@@ -648,6 +629,7 @@ typedef struct xfs_attr_shortform {
struct xfs_attr_sf_hdr { /* constant-structure header block */
__be16 totsize; /* total bytes in shortform list */
__u8 count; /* count of active entries */
+ __u8 padding;
} hdr;
struct xfs_attr_sf_entry {
__uint8_t namelen; /* actual length of name (no NULL) */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
new file mode 100644
index 0000000000000..054a2032fdb39
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+
+/*
+ * Deferred Operations in XFS
+ *
+ * Due to the way locking rules work in XFS, certain transactions (block
+ * mapping and unmapping, typically) have permanent reservations so that
+ * we can roll the transaction to adhere to AG locking order rules and
+ * to unlock buffers between metadata updates. Prior to rmap/reflink,
+ * the mapping code had a mechanism to perform these deferrals for
+ * extents that were going to be freed; this code makes that facility
+ * more generic.
+ *
+ * When adding the reverse mapping and reflink features, it became
+ * necessary to perform complex remapping multi-transactions to comply
+ * with AG locking order rules, and to be able to spread a single
+ * refcount update operation (an operation on an n-block extent can
+ * update as many as n records!) among multiple transactions. XFS can
+ * roll a transaction to facilitate this, but using this facility
+ * requires us to log "intent" items in case log recovery needs to
+ * redo the operation, and to log "done" items to indicate that redo
+ * is not necessary.
+ *
+ * Deferred work is tracked in xfs_defer_pending items. Each pending
+ * item tracks one type of deferred work. Incoming work items (which
+ * have not yet had an intent logged) are attached to a pending item
+ * on the dop_intake list, where they wait for the caller to finish
+ * the deferred operations.
+ *
+ * Finishing a set of deferred operations is an involved process. To
+ * start, we define "rolling a deferred-op transaction" as follows:
+ *
+ * > For each xfs_defer_pending item on the dop_intake list,
+ * - Sort the work items in AG order. XFS locking
+ * order rules require us to lock buffers in AG order.
+ * - Create a log intent item for that type.
+ * - Attach it to the pending item.
+ * - Move the pending item from the dop_intake list to the
+ * dop_pending list.
+ * > Roll the transaction.
+ *
+ * NOTE: To avoid exceeding the transaction reservation, we limit the
+ * number of items that we attach to a given xfs_defer_pending.
+ *
+ * The actual finishing process looks like this:
+ *
+ * > For each xfs_defer_pending in the dop_pending list,
+ * - Roll the deferred-op transaction as above.
+ * - Create a log done item for that type, and attach it to the
+ * log intent item.
+ * - For each work item attached to the log intent item,
+ * * Perform the described action.
+ * * Attach the work item to the log done item.
+ *
+ * The key here is that we must log an intent item for all pending
+ * work items every time we roll the transaction, and that we must log
+ * a done item as soon as the work is completed. With this mechanism
+ * we can perform complex remapping operations, chaining intent items
+ * as needed.
+ *
+ * This is an example of remapping the extent (E, E+B) into file X at
+ * offset A and dealing with the extent (C, C+B) already being mapped
+ * there:
+ * +-------------------------------------------------+
+ * | Unmap file X startblock C offset A length B | t0
+ * | Intent to reduce refcount for extent (C, B) |
+ * | Intent to remove rmap (X, C, A, B) |
+ * | Intent to free extent (D, 1) (bmbt block) |
+ * | Intent to map (X, A, B) at startblock E |
+ * +-------------------------------------------------+
+ * | Map file X startblock E offset A length B | t1
+ * | Done mapping (X, E, A, B) |
+ * | Intent to increase refcount for extent (E, B) |
+ * | Intent to add rmap (X, E, A, B) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C, B) | t2
+ * | Done reducing refcount for extent (C, B) |
+ * | Increase refcount for extent (E, B) |
+ * | Done increasing refcount for extent (E, B) |
+ * | Intent to free extent (C, B) |
+ * | Intent to free extent (F, 1) (refcountbt block) |
+ * | Intent to remove rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Remove rmap (X, C, A, B) | t3
+ * | Done removing rmap (X, C, A, B) |
+ * | Add rmap (X, E, A, B) |
+ * | Done adding rmap (X, E, A, B) |
+ * | Remove rmap (F, 1, REFC) |
+ * | Done removing rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Free extent (C, B) | t4
+ * | Done freeing extent (C, B) |
+ * | Free extent (D, 1) |
+ * | Done freeing extent (D, 1) |
+ * | Free extent (F, 1) |
+ * | Done freeing extent (F, 1) |
+ * +-------------------------------------------------+
+ *
+ * If we should crash before t2 commits, log recovery replays
+ * the following intent items:
+ *
+ * - Intent to reduce refcount for extent (C, B)
+ * - Intent to remove rmap (X, C, A, B)
+ * - Intent to free extent (D, 1) (bmbt block)
+ * - Intent to increase refcount for extent (E, B)
+ * - Intent to add rmap (X, E, A, B)
+ *
+ * In the process of recovering, it should also generate and take care
+ * of these intent items:
+ *
+ * - Intent to free extent (C, B)
+ * - Intent to free extent (F, 1) (refcountbt block)
+ * - Intent to remove rmap (F, 1, REFC)
+ */
+
+static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
+
+/*
+ * For each pending item in the intake list, log its intent item and the
+ * associated extents, then add the entire intake list to the end of
+ * the pending list.
+ */
+STATIC void
+xfs_defer_intake_work(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop)
+{
+ struct list_head *li;
+ struct xfs_defer_pending *dfp;
+
+ list_for_each_entry(dfp, &dop->dop_intake, dfp_list) {
+ trace_xfs_defer_intake_work(tp->t_mountp, dfp);
+ dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
+ dfp->dfp_count);
+ list_sort(tp->t_mountp, &dfp->dfp_work,
+ dfp->dfp_type->diff_items);
+ list_for_each(li, &dfp->dfp_work)
+ dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
+ }
+
+ list_splice_tail_init(&dop->dop_intake, &dop->dop_pending);
+}
+
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ int error)
+{
+ struct xfs_defer_pending *dfp;
+
+ trace_xfs_defer_trans_abort(tp->t_mountp, dop);
+ /*
+ * If the transaction was committed, drop the intent reference
+ * since we're bailing out of here. The other reference is
+ * dropped when the intent hits the AIL. If the transaction
+ * was not committed, the intent is freed by the intent item
+ * unlock handler on abort.
+ */
+ if (!dop->dop_committed)
+ return;
+
+ /* Abort intent items. */
+ list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
+ trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ if (dfp->dfp_committed)
+ dfp->dfp_type->abort_intent(dfp->dfp_intent);
+ }
+
+ /* Shut down FS. */
+ xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR);
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tp,
+ struct xfs_defer_ops *dop,
+ struct xfs_inode *ip)
+{
+ int i;
+ int error;
+
+ /* Log all the joined inodes except the one we passed in. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
+ if (dop->dop_inodes[i] == ip)
+ continue;
+ xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
+ }
+
+ trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
+
+ /* Roll the transaction. */
+ error = xfs_trans_roll(tp, ip);
+ if (error) {
+ trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error);
+ xfs_defer_trans_abort(*tp, dop, error);
+ return error;
+ }
+ dop->dop_committed = true;
+
+ /* Rejoin the joined inodes except the one we passed in. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
+ if (dop->dop_inodes[i] == ip)
+ continue;
+ xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
+ }
+
+ return error;
+}
+
+/* Do we have any work items to finish? */
+bool
+xfs_defer_has_unfinished_work(
+ struct xfs_defer_ops *dop)
+{
+ return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake);
+}
+
+/*
+ * Add this inode to the deferred op. Each joined inode is relogged
+ * each time we roll the transaction, in addition to any inode passed
+ * to xfs_defer_finish().
+ */
+int
+xfs_defer_join(
+ struct xfs_defer_ops *dop,
+ struct xfs_inode *ip)
+{
+ int i;
+
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) {
+ if (dop->dop_inodes[i] == ip)
+ return 0;
+ else if (dop->dop_inodes[i] == NULL) {
+ dop->dop_inodes[i] = ip;
+ return 0;
+ }
+ }
+
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Finish all the pending work. This involves logging intent items for
+ * any work items that wandered in since the last transaction roll (if
+ * one has even happened), rolling the transaction, and finishing the
+ * work items in the first item on the logged-and-pending list.
+ *
+ * If an inode is provided, relog it to the new transaction.
+ */
+int
+xfs_defer_finish(
+ struct xfs_trans **tp,
+ struct xfs_defer_ops *dop,
+ struct xfs_inode *ip)
+{
+ struct xfs_defer_pending *dfp;
+ struct list_head *li;
+ struct list_head *n;
+ void *done_item = NULL;
+ void *state;
+ int error = 0;
+ void (*cleanup_fn)(struct xfs_trans *, void *, int);
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ trace_xfs_defer_finish((*tp)->t_mountp, dop);
+
+ /* Until we run out of pending work to finish... */
+ while (xfs_defer_has_unfinished_work(dop)) {
+ /* Log intents for work items sitting in the intake. */
+ xfs_defer_intake_work(*tp, dop);
+
+ /* Roll the transaction. */
+ error = xfs_defer_trans_roll(tp, dop, ip);
+ if (error)
+ goto out;
+
+ /* Mark all pending intents as committed. */
+ list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
+ if (dfp->dfp_committed)
+ break;
+ trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
+ dfp->dfp_committed = true;
+ }
+
+ /* Log an intent-done item for the first pending item. */
+ dfp = list_first_entry(&dop->dop_pending,
+ struct xfs_defer_pending, dfp_list);
+ trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
+ done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+ dfp->dfp_count);
+ cleanup_fn = dfp->dfp_type->finish_cleanup;
+
+ /* Finish the work items. */
+ state = NULL;
+ list_for_each_safe(li, n, &dfp->dfp_work) {
+ list_del(li);
+ dfp->dfp_count--;
+ error = dfp->dfp_type->finish_item(*tp, dop, li,
+ done_item, &state);
+ if (error) {
+ /*
+ * Clean up after ourselves and jump out.
+ * xfs_defer_cancel will take care of freeing
+ * all these lists and stuff.
+ */
+ if (cleanup_fn)
+ cleanup_fn(*tp, state, error);
+ xfs_defer_trans_abort(*tp, dop, error);
+ goto out;
+ }
+ }
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_free(dfp);
+
+ if (cleanup_fn)
+ cleanup_fn(*tp, state, error);
+ }
+
+out:
+ if (error)
+ trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
+ else
+ trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
+ return error;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_defer_cancel(
+ struct xfs_defer_ops *dop)
+{
+ struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *pli;
+ struct list_head *pwi;
+ struct list_head *n;
+
+ trace_xfs_defer_cancel(NULL, dop);
+
+ /*
+ * Free the pending items. Caller should already have arranged
+ * for the intent items to be released.
+ */
+ list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) {
+ trace_xfs_defer_intake_cancel(NULL, dfp);
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ dfp->dfp_type->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_free(dfp);
+ }
+ list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) {
+ trace_xfs_defer_pending_cancel(NULL, dfp);
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ dfp->dfp_type->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_free(dfp);
+ }
+}
+
+/* Add an item for later deferred processing. */
+void
+xfs_defer_add(
+ struct xfs_defer_ops *dop,
+ enum xfs_defer_ops_type type,
+ struct list_head *li)
+{
+ struct xfs_defer_pending *dfp = NULL;
+
+ /*
+ * Add the item to a pending item at the end of the intake list.
+ * If the last pending item has the same type, reuse it. Else,
+ * create a new pending item at the end of the intake list.
+ */
+ if (!list_empty(&dop->dop_intake)) {
+ dfp = list_last_entry(&dop->dop_intake,
+ struct xfs_defer_pending, dfp_list);
+ if (dfp->dfp_type->type != type ||
+ (dfp->dfp_type->max_items &&
+ dfp->dfp_count >= dfp->dfp_type->max_items))
+ dfp = NULL;
+ }
+ if (!dfp) {
+ dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
+ KM_SLEEP | KM_NOFS);
+ dfp->dfp_type = defer_op_types[type];
+ dfp->dfp_committed = false;
+ dfp->dfp_intent = NULL;
+ dfp->dfp_count = 0;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, &dop->dop_intake);
+ }
+
+ list_add_tail(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+}
+
+/* Register a deferred operation type in the defer_op_types table. */
+void
+xfs_defer_init_op_type(
+ const struct xfs_defer_op_type *type)
+{
+ defer_op_types[type->type] = type;
+}
+
+/* Initialize a deferred operation. */
+void
+xfs_defer_init(
+ struct xfs_defer_ops *dop,
+ xfs_fsblock_t *fbp)
+{
+ dop->dop_committed = false;
+ dop->dop_low = false;
+ memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
+ *fbp = NULLFSBLOCK;
+ INIT_LIST_HEAD(&dop->dop_intake);
+ INIT_LIST_HEAD(&dop->dop_pending);
+ trace_xfs_defer_init(NULL, dop);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
new file mode 100644
index 0000000000000..cc3981c482968
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_DEFER_H__
+#define __XFS_DEFER_H__
+
+struct xfs_defer_op_type;
+
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.
+ */
+struct xfs_defer_pending {
+ const struct xfs_defer_op_type *dfp_type; /* function pointers */
+ struct list_head dfp_list; /* pending items */
+ bool dfp_committed; /* committed trans? */
+ void *dfp_intent; /* log intent item */
+ struct list_head dfp_work; /* work items */
+ unsigned int dfp_count; /* # extent items */
+};
+
+/*
+ * Header for deferred operation list (struct xfs_defer_ops, declared below).
+ *
+ * dop_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent. In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs. In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0. If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+enum xfs_defer_ops_type {
+ XFS_DEFER_OPS_TYPE_RMAP,
+ XFS_DEFER_OPS_TYPE_FREE,
+ XFS_DEFER_OPS_TYPE_MAX,
+};
+
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+
+struct xfs_defer_ops {
+ bool dop_committed; /* did any trans commit? */
+ bool dop_low; /* alloc in low mode */
+ struct list_head dop_intake; /* unlogged pending work */
+ struct list_head dop_pending; /* logged pending work */
+
+ /* relog these inodes with each roll */
+ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES];
+};
+
+void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type,
+ struct list_head *h);
+int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop,
+ struct xfs_inode *ip);
+void xfs_defer_cancel(struct xfs_defer_ops *dop);
+void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp);
+bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop);
+int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip);
+
+/* Description of a deferred type. */
+struct xfs_defer_op_type {
+ enum xfs_defer_ops_type type;
+ unsigned int max_items;
+ void (*abort_intent)(void *);
+ void *(*create_done)(struct xfs_trans *, void *, unsigned int);
+ int (*finish_item)(struct xfs_trans *, struct xfs_defer_ops *,
+ struct list_head *, void *, void **);
+ void (*finish_cleanup)(struct xfs_trans *, void *, int);
+ void (*cancel_item)(struct list_head *);
+ int (*diff_items)(void *, struct list_head *, struct list_head *);
+ void *(*create_intent)(struct xfs_trans *, uint);
+ void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+};
+
+void xfs_defer_init_op_type(const struct xfs_defer_op_type *type);
+
+#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index af0f9d171f8a0..20a96dd5af7eb 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -21,6 +21,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
@@ -259,7 +260,7 @@ xfs_dir_createname(
struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
- xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ struct xfs_defer_ops *dfops, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
@@ -286,7 +287,7 @@ xfs_dir_createname(
args->inumber = inum;
args->dp = dp;
args->firstblock = first;
- args->flist = flist;
+ args->dfops = dfops;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -436,7 +437,7 @@ xfs_dir_removename(
struct xfs_name *name,
xfs_ino_t ino,
xfs_fsblock_t *first, /* bmap's firstblock */
- xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ struct xfs_defer_ops *dfops, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
@@ -458,7 +459,7 @@ xfs_dir_removename(
args->inumber = ino;
args->dp = dp;
args->firstblock = first;
- args->flist = flist;
+ args->dfops = dfops;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -498,7 +499,7 @@ xfs_dir_replace(
struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
- xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ struct xfs_defer_ops *dfops, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
@@ -523,7 +524,7 @@ xfs_dir_replace(
args->inumber = inum;
args->dp = dp;
args->firstblock = first;
- args->flist = flist;
+ args->dfops = dfops;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -680,7 +681,7 @@ xfs_dir2_shrink_inode(
/* Unmap the fsblock(s). */
error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0,
- args->firstblock, args->flist, &done);
+ args->firstblock, args->dfops, &done);
if (error) {
/*
* ENOSPC actually can happen if we're in a removename with no
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index e55353651f5b8..becc926c3e3d9 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -18,7 +18,7 @@
#ifndef __XFS_DIR2_H__
#define __XFS_DIR2_H__
-struct xfs_bmap_free;
+struct xfs_defer_ops;
struct xfs_da_args;
struct xfs_inode;
struct xfs_mount;
@@ -129,18 +129,18 @@ extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_fsblock_t *first,
- struct xfs_bmap_free *flist, xfs_extlen_t tot);
+ struct xfs_defer_ops *dfops, xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_fsblock_t *first,
- struct xfs_bmap_free *flist, xfs_extlen_t tot);
+ struct xfs_defer_ops *dfops, xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_fsblock_t *first,
- struct xfs_bmap_free *flist, xfs_extlen_t tot);
+ struct xfs_defer_ops *dfops, xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 974d62e677f45..c6809ff41197d 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
/*
* Calculate the new size, see if we should give up yet.
*/
- size = xfs_dir2_sf_hdr_size(i8count) + /* header */
- count + /* namelen */
- count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
- namelen + /* name */
- (i8count ? /* inumber */
- (uint)sizeof(xfs_dir2_ino8_t) * count :
- (uint)sizeof(xfs_dir2_ino4_t) * count);
+ size = xfs_dir2_sf_hdr_size(i8count) + /* header */
+ count * 3 * sizeof(u8) + /* namelen + offset */
+ namelen + /* name */
+ (i8count ? /* inumber */
+ count * XFS_INO64_SIZE :
+ count * XFS_INO32_SIZE);
if (size > XFS_IFORK_DSIZE(dp))
return size; /* size value is a failure */
}
@@ -257,15 +256,12 @@ xfs_dir2_block_to_sf(
*
* Convert the inode to local format and copy the data in.
*/
- dp->i_df.if_flags &= ~XFS_IFEXTENTS;
- dp->i_df.if_flags |= XFS_IFINLINE;
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
ASSERT(dp->i_df.if_bytes == 0);
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+ xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
+ dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ dp->i_d.di_size = size;
logflags |= XFS_ILOG_DDATA;
- memcpy(dp->i_df.if_u1.if_data, dst, size);
- dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
@@ -322,10 +318,7 @@ xfs_dir2_sf_addname(
/*
* Yes, adjust the inode size. old count + (parent + new)
*/
- incr_isize +=
- (sfp->count + 2) *
- ((uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t));
+ incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
objchange = 1;
}
@@ -900,11 +893,7 @@ xfs_dir2_sf_replace(
int error; /* error return value */
int newsize; /* new inode size */
- newsize =
- dp->i_df.if_bytes +
- (sfp->count + 1) *
- ((uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t));
+ newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
/*
* Won't fit as shortform, convert to block then do replace.
*/
@@ -1025,10 +1014,7 @@ xfs_dir2_sf_toino4(
/*
* Compute the new inode size.
*/
- newsize =
- oldsize -
- (oldsfp->count + 1) *
- ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+ newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
/*
@@ -1051,7 +1037,7 @@ xfs_dir2_sf_toino4(
i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
- sfep->offset = oldsfep->offset;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
dp->d_ops->sf_put_ino(sfp, sfep,
dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@@ -1101,10 +1087,7 @@ xfs_dir2_sf_toino8(
/*
* Compute the new inode size (nb: entry count + 1 for parent)
*/
- newsize =
- oldsize +
- (oldsfp->count + 1) *
- ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+ newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
/*
@@ -1127,7 +1110,7 @@ xfs_dir2_sf_toino8(
i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
- sfep->offset = oldsfep->offset;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
dp->d_ops->sf_put_ino(sfp, sfep,
dp->d_ops->sf_get_ino(oldsfp, oldsfep));
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index dc97eb21af071..f814d42c73b2f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -455,8 +455,10 @@ xfs_sb_has_compat_feature(
}
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
- (XFS_SB_FEAT_RO_COMPAT_FINOBT)
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -538,6 +540,12 @@ static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
}
+static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
+}
+
/*
* end of superblock version macros
*/
@@ -598,10 +606,10 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
/*
- * Btree number 0 is bno, 1 is cnt. This value gives the size of the
+ * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
* arrays below.
*/
-#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
+#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1)
/*
* The second word of agf_levels in the first a.g. overlaps the EFS
@@ -618,12 +626,10 @@ typedef struct xfs_agf {
__be32 agf_seqno; /* sequence # starting from 0 */
__be32 agf_length; /* size in blocks of a.g. */
/*
- * Freespace information
+ * Freespace and rmap information
*/
__be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
- __be32 agf_spare0; /* spare field */
__be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
- __be32 agf_spare1; /* spare field */
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
@@ -1308,17 +1314,118 @@ typedef __be32 xfs_inobt_ptr_t;
#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
/*
- * The first data block of an AG depends on whether the filesystem was formatted
- * with the finobt feature. If so, account for the finobt reserved root btree
- * block.
+ * Reverse mapping btree format definitions
+ *
+ * There is a btree for the reverse map per allocation group
+ */
+#define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */
+
+/*
+ * Ownership info for an extent. This is used to create reverse-mapping
+ * entries.
*/
-#define XFS_PREALLOC_BLOCKS(mp) \
+#define XFS_OWNER_INFO_ATTR_FORK (1 << 0)
+#define XFS_OWNER_INFO_BMBT_BLOCK (1 << 1)
+struct xfs_owner_info {
+ uint64_t oi_owner;
+ xfs_fileoff_t oi_offset;
+ unsigned int oi_flags;
+};
+
+/*
+ * Special owner types.
+ *
+ * Seeing as we only support up to 8EB, we have the upper bit of the owner field
+ * to tell us we have a special owner value. We use these for static metadata
+ * allocated at mkfs/growfs time, as well as for freespace management metadata.
+ */
+#define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */
+#define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */
+#define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */
+#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */
+#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */
+#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */
+
+#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63)))
+
+/*
+ * Data record structure
+ */
+struct xfs_rmap_rec {
+ __be32 rm_startblock; /* extent start block */
+ __be32 rm_blockcount; /* extent length */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+};
+
+/*
+ * rmap btree record
+ * rm_offset:63 is the attribute fork flag
+ * rm_offset:62 is the bmbt block flag
+ * rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt)
+ * rm_offset:54-60 aren't used and should be zero
+ * rm_offset:0-53 is the block offset within the inode
+ */
+#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63)
+#define XFS_RMAP_OFF_BMBT_BLOCK ((__uint64_t)1ULL << 62)
+#define XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61)
+
+#define XFS_RMAP_LEN_MAX ((__uint32_t)~0U)
+#define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \
+ XFS_RMAP_OFF_BMBT_BLOCK | \
+ XFS_RMAP_OFF_UNWRITTEN)
+#define XFS_RMAP_OFF_MASK ((__uint64_t)0x3FFFFFFFFFFFFFULL)
+
+#define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK)
+
+#define XFS_RMAP_IS_BMBT_BLOCK(off) (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK))
+#define XFS_RMAP_IS_ATTR_FORK(off) (!!((off) & XFS_RMAP_OFF_ATTR_FORK))
+#define XFS_RMAP_IS_UNWRITTEN(off) (!!((off) & XFS_RMAP_OFF_UNWRITTEN))
+
+#define RMAPBT_STARTBLOCK_BITLEN 32
+#define RMAPBT_BLOCKCOUNT_BITLEN 32
+#define RMAPBT_OWNER_BITLEN 64
+#define RMAPBT_ATTRFLAG_BITLEN 1
+#define RMAPBT_BMBTFLAG_BITLEN 1
+#define RMAPBT_EXNTFLAG_BITLEN 1
+#define RMAPBT_UNUSED_OFFSET_BITLEN 7
+#define RMAPBT_OFFSET_BITLEN 54
+
+#define XFS_RMAP_ATTR_FORK (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK (1 << 1)
+#define XFS_RMAP_UNWRITTEN (1 << 2)
+#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
+ XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+ xfs_agblock_t rm_startblock; /* extent start block */
+ xfs_extlen_t rm_blockcount; /* extent length */
+ __uint64_t rm_owner; /* extent owner */
+ __uint64_t rm_offset; /* offset within the owner */
+ unsigned int rm_flags; /* state flags */
+};
+
+/*
+ * Key structure
+ *
+ * We don't use the length for lookups
+ */
+struct xfs_rmap_key {
+ __be32 rm_startblock; /* extent start block */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+} __attribute__((packed));
+
+/* btree pointer type */
+typedef __be32 xfs_rmap_ptr_t;
+
+#define XFS_RMAP_BLOCK(mp) \
(xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
XFS_FIBT_BLOCK(mp) + 1 : \
XFS_IBT_BLOCK(mp) + 1)
-
-
/*
* BMAP Btree format definitions
*
@@ -1435,41 +1542,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
* with the crc feature bit, and all accesses to them must be conditional on
* that flag.
*/
+/* short form block header */
+struct xfs_btree_block_shdr {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
+};
+
+/* long form block header */
+struct xfs_btree_block_lhdr {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
+};
+
struct xfs_btree_block {
__be32 bb_magic; /* magic number for block type */
__be16 bb_level; /* 0 is a leaf */
__be16 bb_numrecs; /* current # of data records */
union {
- struct {
- __be32 bb_leftsib;
- __be32 bb_rightsib;
-
- __be64 bb_blkno;
- __be64 bb_lsn;
- uuid_t bb_uuid;
- __be32 bb_owner;
- __le32 bb_crc;
- } s; /* short form pointers */
- struct {
- __be64 bb_leftsib;
- __be64 bb_rightsib;
-
- __be64 bb_blkno;
- __be64 bb_lsn;
- uuid_t bb_uuid;
- __be64 bb_owner;
- __le32 bb_crc;
- __be32 bb_pad; /* padding for alignment */
- } l; /* long form pointers */
+ struct xfs_btree_block_shdr s;
+ struct xfs_btree_block_lhdr l;
} bb_u; /* rest */
};
-#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+/* size of a short form block */
+#define XFS_BTREE_SBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_shdr, bb_blkno))
+/* size of a long form block */
+#define XFS_BTREE_LBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_lhdr, bb_blkno))
/* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
+#define XFS_BTREE_SBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_shdr))
+#define XFS_BTREE_LBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_lhdr))
#define XFS_BTREE_SBLOCK_CRC_OFF \
offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index fffe3d01bd9fb..79455058b7525 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -206,6 +206,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */
/*
* Minimum and maximum sizes need for growth checks.
@@ -521,12 +522,8 @@ typedef struct xfs_swapext
#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
-/* XFS_IOC_FREEZE -- FIFREEZE 119 */
-/* XFS_IOC_THAW -- FITHAW 120 */
-#ifndef FIFREEZE
-#define XFS_IOC_FREEZE _IOWR('X', 119, int)
-#define XFS_IOC_THAW _IOWR('X', 120, int)
-#endif
+#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
+#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 22297f9b0fd52..51b4e0de1fdc4 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
@@ -39,6 +40,7 @@
#include "xfs_icache.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_rmap.h"
/*
@@ -614,6 +616,7 @@ xfs_ialloc_ag_alloc(
args.tp = tp;
args.mp = tp->t_mountp;
args.fsbno = NULLFSBLOCK;
+ xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);
#ifdef DEBUG
/* randomly do sparse inode allocations */
@@ -1817,20 +1820,21 @@ xfs_difree_inode_chunk(
struct xfs_mount *mp,
xfs_agnumber_t agno,
struct xfs_inobt_rec_incore *rec,
- struct xfs_bmap_free *flist)
+ struct xfs_defer_ops *dfops)
{
xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
int startidx, endidx;
int nextbit;
xfs_agblock_t agbno;
int contigblk;
+ struct xfs_owner_info oinfo;
DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
- XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
- mp->m_ialloc_blks, flist, mp);
+ xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ mp->m_ialloc_blks, &oinfo);
return;
}
@@ -1873,8 +1877,8 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- flist, mp);
+ xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, agbno),
+ contigblk, &oinfo);
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
@@ -1890,7 +1894,7 @@ xfs_difree_inobt(
struct xfs_trans *tp,
struct xfs_buf *agbp,
xfs_agino_t agino,
- struct xfs_bmap_free *flist,
+ struct xfs_defer_ops *dfops,
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
@@ -1977,7 +1981,7 @@ xfs_difree_inobt(
goto error0;
}
- xfs_difree_inode_chunk(mp, agno, &rec, flist);
+ xfs_difree_inode_chunk(mp, agno, &rec, dfops);
} else {
xic->deleted = 0;
@@ -2122,7 +2126,7 @@ int
xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
- struct xfs_bmap_free *flist, /* extents to free */
+ struct xfs_defer_ops *dfops, /* extents to free */
struct xfs_icluster *xic) /* cluster info if deleted */
{
/* REFERENCED */
@@ -2174,7 +2178,7 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
+ error = xfs_difree_inobt(mp, tp, agbp, agino, dfops, xic, &rec);
if (error)
goto error0;
@@ -2395,20 +2399,11 @@ void
xfs_ialloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- int level;
- uint maxblocks;
- uint maxleafents;
- int minleafrecs;
- int minnoderecs;
-
- maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
- XFS_INODES_PER_CHUNK_LOG;
- minleafrecs = mp->m_inobt_mnr[0];
- minnoderecs = mp->m_inobt_mnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
- mp->m_in_maxlevels = level;
+ uint inodes;
+
+ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+ mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
+ inodes);
}
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 6e450df2979bf..0bb89669fc072 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -95,7 +95,7 @@ int /* error */
xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
- struct xfs_bmap_free *flist, /* extents to free */
+ struct xfs_defer_ops *dfops, /* extents to free */
struct xfs_icluster *ifree); /* cluster info if deleted */
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 89c21d771e35e..31ca2208c03df 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -32,6 +32,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_trans.h"
+#include "xfs_rmap.h"
STATIC int
@@ -96,6 +97,7 @@ xfs_inobt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
+ xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT);
args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
args.minlen = 1;
args.maxlen = 1;
@@ -125,8 +127,12 @@ xfs_inobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
return xfs_free_extent(cur->bc_tp,
- XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1);
+ XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
+ &oinfo);
}
STATIC int
@@ -146,14 +152,6 @@ xfs_inobt_init_key_from_rec(
}
STATIC void
-xfs_inobt_init_rec_from_key(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
-{
- rec->inobt.ir_startino = key->inobt.ir_startino;
-}
-
-STATIC void
xfs_inobt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -314,7 +312,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
.init_key_from_rec = xfs_inobt_init_key_from_rec,
- .init_rec_from_key = xfs_inobt_init_rec_from_key,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
@@ -336,7 +333,6 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
.init_key_from_rec = xfs_inobt_init_key_from_rec,
- .init_rec_from_key = xfs_inobt_init_rec_from_key,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 9d9559eb2835a..4b9769e23c834 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -22,6 +22,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 11faf7df14c80..bbcc8c7a44b3f 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -231,6 +231,48 @@ xfs_iformat_fork(
return error;
}
+void
+xfs_init_local_fork(
+ struct xfs_inode *ip,
+ int whichfork,
+ const void *data,
+ int size)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int mem_size = size, real_size = 0;
+ bool zero_terminate;
+
+ /*
+ * If we are using the local fork to store a symlink body we need to
+ * zero-terminate it so that we can pass it back to the VFS directly.
+ * Overallocate the in-memory fork by one for that and add a zero
+ * to terminate it below.
+ */
+ zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+ if (zero_terminate)
+ mem_size++;
+
+ if (size == 0)
+ ifp->if_u1.if_data = NULL;
+ else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ else {
+ real_size = roundup(mem_size, 4);
+ ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+ }
+
+ if (size) {
+ memcpy(ifp->if_u1.if_data, data, size);
+ if (zero_terminate)
+ ifp->if_u1.if_data[size] = '\0';
+ }
+
+ ifp->if_bytes = size;
+ ifp->if_real_bytes = real_size;
+ ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+ ifp->if_flags |= XFS_IFINLINE;
+}
+
/*
* The file is in-lined in the on-disk inode.
* If it fits into if_inline_data, then copy
@@ -248,8 +290,6 @@ xfs_iformat_local(
int whichfork,
int size)
{
- xfs_ifork_t *ifp;
- int real_size;
/*
* If the size is unreasonable, then something
@@ -265,22 +305,8 @@ xfs_iformat_local(
ip->i_mount, dip);
return -EFSCORRUPTED;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
- real_size = 0;
- if (size == 0)
- ifp->if_u1.if_data = NULL;
- else if (size <= sizeof(ifp->if_u2.if_inline_data))
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- else {
- real_size = roundup(size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
- }
- ifp->if_bytes = size;
- ifp->if_real_bytes = real_size;
- if (size)
- memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- ifp->if_flags |= XFS_IFINLINE;
+
+ xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
return 0;
}
@@ -516,7 +542,6 @@ xfs_iroot_realloc(
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
KM_SLEEP | KM_NOFS);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
@@ -660,7 +685,6 @@ xfs_idata_realloc(
ifp->if_u1.if_data =
kmem_realloc(ifp->if_u1.if_data,
real_size,
- ifp->if_real_bytes,
KM_SLEEP | KM_NOFS);
}
} else {
@@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct(
if (rnew_size != ifp->if_real_bytes) {
ifp->if_u1.if_extents =
kmem_realloc(ifp->if_u1.if_extents,
- rnew_size,
- ifp->if_real_bytes, KM_NOFS);
+ rnew_size, KM_NOFS);
}
if (rnew_size > ifp->if_real_bytes) {
memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect(
if (new_size == 0) {
xfs_iext_destroy(ifp);
} else {
- ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
- kmem_realloc(ifp->if_u1.if_ext_irec,
- new_size, size, KM_NOFS);
+ ifp->if_u1.if_ext_irec =
+ kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
}
}
@@ -1497,6 +1519,24 @@ xfs_iext_indirect_to_direct(
}
/*
+ * Remove all records from the indirection array.
+ */
+STATIC void
+xfs_iext_irec_remove_all(
+ struct xfs_ifork *ifp)
+{
+ int nlists;
+ int i;
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ for (i = 0; i < nlists; i++)
+ kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
+ kmem_free(ifp->if_u1.if_ext_irec);
+ ifp->if_flags &= ~XFS_IFEXTIREC;
+}
+
+/*
* Free incore file extents.
*/
void
@@ -1504,14 +1544,7 @@ xfs_iext_destroy(
xfs_ifork_t *ifp) /* inode fork pointer */
{
if (ifp->if_flags & XFS_IFEXTIREC) {
- int erp_idx;
- int nlists;
-
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
- xfs_iext_irec_remove(ifp, erp_idx);
- }
- ifp->if_flags &= ~XFS_IFEXTIREC;
+ xfs_iext_irec_remove_all(ifp);
} else if (ifp->if_real_bytes) {
kmem_free(ifp->if_u1.if_extents);
} else if (ifp->if_bytes) {
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe9..f95e072ae6468 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -134,6 +134,7 @@ void xfs_iroot_realloc(struct xfs_inode *, int, int);
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
int);
+void xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
struct xfs_bmbt_rec_host *
xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index d54a8018b079d..a6eed43fa7cd5 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -110,7 +110,9 @@ static inline uint xlog_get_cycle(char *ptr)
#define XLOG_REG_TYPE_COMMIT 18
#define XLOG_REG_TYPE_TRANSHDR 19
#define XLOG_REG_TYPE_ICREATE 20
-#define XLOG_REG_TYPE_MAX 20
+#define XLOG_REG_TYPE_RUI_FORMAT 21
+#define XLOG_REG_TYPE_RUD_FORMAT 22
+#define XLOG_REG_TYPE_MAX 22
/*
* Flags to log operation header
@@ -212,6 +214,11 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
/*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT 40
+
+/*
* Log item types.
*/
#define XFS_LI_EFI 0x1236
@@ -222,6 +229,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_DQUOT 0x123d
#define XFS_LI_QUOTAOFF 0x123e
#define XFS_LI_ICREATE 0x123f
+#define XFS_LI_RUI 0x1240 /* rmap update intent */
+#define XFS_LI_RUD 0x1241
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -231,7 +240,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_BUF, "XFS_LI_BUF" }, \
{ XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
{ XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
- { XFS_LI_ICREATE, "XFS_LI_ICREATE" }
+ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \
+ { XFS_LI_RUI, "XFS_LI_RUI" }, \
+ { XFS_LI_RUD, "XFS_LI_RUD" }
/*
* Inode Log Item Format definitions.
@@ -599,6 +610,59 @@ typedef struct xfs_efd_log_format_64 {
} xfs_efd_log_format_64_t;
/*
+ * RUI/RUD (reverse mapping) log format definitions
+ */
+struct xfs_map_extent {
+ __uint64_t me_owner;
+ __uint64_t me_startblock;
+ __uint64_t me_startoff;
+ __uint32_t me_len;
+ __uint32_t me_flags;
+};
+
+/* rmap me_flags: upper bits are flags, lower byte is type code */
+#define XFS_RMAP_EXTENT_MAP 1
+#define XFS_RMAP_EXTENT_UNMAP 3
+#define XFS_RMAP_EXTENT_CONVERT 5
+#define XFS_RMAP_EXTENT_ALLOC 7
+#define XFS_RMAP_EXTENT_FREE 8
+#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_RMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_RMAP_EXTENT_BMBT_BLOCK (1U << 30)
+#define XFS_RMAP_EXTENT_UNWRITTEN (1U << 29)
+
+#define XFS_RMAP_EXTENT_FLAGS (XFS_RMAP_EXTENT_TYPE_MASK | \
+ XFS_RMAP_EXTENT_ATTR_FORK | \
+ XFS_RMAP_EXTENT_BMBT_BLOCK | \
+ XFS_RMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out an rui log item in the
+ * log. The rui_extents field is a variable size array whose
+ * size is given by rui_nextents.
+ */
+struct xfs_rui_log_format {
+ __uint16_t rui_type; /* rui log item type */
+ __uint16_t rui_size; /* size of this item */
+ __uint32_t rui_nextents; /* # extents in rui_extents */
+ __uint64_t rui_id; /* rui identifier */
+ struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */
+};
+
+/*
+ * This is the structure used to lay out an rud log item in the
+ * log. It carries no extent array; rud_rui_id identifies the
+ * corresponding rui log item.
+ */
+struct xfs_rud_log_format {
+ __uint16_t rud_type; /* rud log item type */
+ __uint16_t rud_size; /* size of this item */
+ __uint32_t __pad;
+ __uint64_t rud_rui_id; /* id of corresponding rui */
+};
+
+/*
* Dquot Log format definitions.
*
* The first two fields must be the type and size fitting into
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
new file mode 100644
index 0000000000000..73d05407d6636
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * Lookup the first record less than or equal to [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_le(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ int *stat)
+{
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Lookup the record exactly matching [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ int *stat)
+{
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by the in-core record irec.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_rmap_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec)
+{
+ union xfs_btree_rec rec;
+ int error;
+
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno,
+ irec->rm_startblock, irec->rm_blockcount,
+ irec->rm_owner, irec->rm_offset, irec->rm_flags);
+
+ rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
+ rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount);
+ rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner);
+ rec.rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(irec));
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_rmap_update_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+int
+xfs_rmap_insert(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done);
+
+ rcur->bc_rec.r.rm_startblock = agbno;
+ rcur->bc_rec.r.rm_blockcount = len;
+ rcur->bc_rec.r.rm_owner = owner;
+ rcur->bc_rec.r.rm_offset = offset;
+ rcur->bc_rec.r.rm_flags = flags;
+ error = xfs_btree_insert(rcur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+done:
+ if (error)
+ trace_xfs_rmap_insert_error(rcur->bc_mp,
+ rcur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+static int
+xfs_rmap_btrec_to_irec(
+ union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec)
+{
+ irec->rm_flags = 0;
+ irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
+ irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
+ irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
+ return xfs_rmap_irec_offset_unpack(be64_to_cpu(rec->rmap.rm_offset),
+ irec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_rmap_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ return xfs_rmap_btrec_to_irec(rec, irec);
+}
+
+/*
+ * Find the extent in the rmap btree and remove it.
+ *
+ * The record we find should always be an exact match for the extent that we're
+ * looking for, since we insert them into the btree without modification.
+ *
+ * Special Case #1: when growing the filesystem, we "free" an extent when
+ * growing the last AG. This extent is new space and so it is not tracked as
+ * used space in the btree. The growfs code will pass in an owner of
+ * XFS_RMAP_OWN_NULL to indicate that it expected that there is no owner of this
+ * extent. We verify that: the extent lookup must result in a record that does
+ * not overlap.
+ *
+ * Special Case #2: EFIs do not record the owner of the extent, so when
+ * recovering EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap
+ * btree to ignore the owner (i.e. wildcard match) so we don't trigger
+ * corruption checks during log recovery.
+ */
+STATIC int
+xfs_rmap_unmap(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+
+ error = xfs_rmap_get_rec(cur, &ltrec, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ ltoff = ltrec.rm_offset;
+
+ /*
+ * For growfs, the incoming extent must be beyond the left record we
+ * just found as it is new space and won't be used by anyone. This is
+ * just a corruption check as we don't actually do anything with this
+ * extent. Note that we need to use >= instead of > because it might
+ * be the case that the "left" extent goes all the way to EOFS.
+ */
+ if (owner == XFS_RMAP_OWN_NULL) {
+ XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock +
+ ltrec.rm_blockcount, out_error);
+ goto out_done;
+ }
+
+ /* Make sure the unwritten flag matches. */
+ XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
+ ltrec.rm_startblock + ltrec.rm_blockcount >=
+ bno + len, out_error);
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner ||
+ XFS_RMAP_NON_INODE_OWNER(owner), out_error);
+
+ /* Check the offset, if necessary. */
+ if (!XFS_RMAP_NON_INODE_OWNER(owner)) {
+ if (flags & XFS_RMAP_BMBT_BLOCK) {
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK,
+ out_error);
+ } else {
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltrec.rm_offset <= offset, out_error);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltoff + ltrec.rm_blockcount >= offset + len,
+ out_error);
+ }
+ }
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* exact match, simply remove the record from rmap tree */
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ ltrec.rm_startblock, ltrec.rm_blockcount,
+ ltrec.rm_owner, ltrec.rm_offset,
+ ltrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ if (!ignore_off)
+ ltrec.rm_offset += len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * overlap right hand side of extent: trim the length and update
+ * the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else {
+
+ /*
+ * overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto out_error;
+
+ cur->bc_rec.r.rm_startblock = bno + len;
+ cur->bc_rec.r.rm_blockcount = orig_len - len -
+ ltrec.rm_blockcount;
+ cur->bc_rec.r.rm_owner = ltrec.rm_owner;
+ if (ignore_off)
+ cur->bc_rec.r.rm_offset = 0;
+ else
+ cur->bc_rec.r.rm_offset = offset + len;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ cur->bc_rec.r.rm_startblock,
+ cur->bc_rec.r.rm_blockcount,
+ cur->bc_rec.r.rm_owner,
+ cur->bc_rec.r.rm_offset,
+ cur->bc_rec.r.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ }
+
+out_done:
+ trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remove a reference to an extent in the rmap btree.
+ */
+int
+xfs_rmap_free(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return 0;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+
+ error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
+ if (error)
+ goto out_error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+out_error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * A mergeable rmap must have the same owner and the same values for
+ * the unwritten, attr_fork, and bmbt flags. The startblock and
+ * offset are checked separately.
+ */
+static bool
+xfs_rmap_is_mergeable(
+ struct xfs_rmap_irec *irec,
+ uint64_t owner,
+ unsigned int flags)
+{
+ if (irec->rm_owner == XFS_RMAP_OWN_NULL)
+ return false;
+ if (irec->rm_owner != owner)
+ return false;
+ if ((flags & XFS_RMAP_UNWRITTEN) ^
+ (irec->rm_flags & XFS_RMAP_UNWRITTEN))
+ return false;
+ if ((flags & XFS_RMAP_ATTR_FORK) ^
+ (irec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return false;
+ if ((flags & XFS_RMAP_BMBT_BLOCK) ^
+ (irec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ return false;
+ return true;
+}
+
+/*
+ * When we allocate a new block, the first thing we do is add a reference to
+ * the extent in the rmap btree. This takes the form of a [agbno, length,
+ * owner, offset] record. Flags are encoded in the high bits of the offset
+ * field.
+ *
+ * Add the mapping [bno, bno + len) for the owner described by oinfo to the
+ * rmap btree that cur points into. The new extent is merged into the record
+ * on its left and/or right when that neighbour passes
+ * xfs_rmap_is_mergeable() for the same owner and flags, is contiguous in
+ * block space, and (unless the owner is a non-inode owner or a bmbt block)
+ * is contiguous in file offset as well. Otherwise a new record is inserted.
+ *
+ * unwritten selects whether XFS_RMAP_UNWRITTEN is set on the new mapping.
+ * Returns 0 on success or the error reported by the btree helpers; failed
+ * consistency checks also leave through out_error.
+ */
+STATIC int
+xfs_rmap_map(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(owner != 0);
+ /* File offsets are only compared for inode-owned, non-bmbt extents. */
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ &have_lt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+
+ error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+
+ /* A left record we cannot merge with is treated as absent from here on. */
+ if (!xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+
+ /* A mergeable left record must not overlap the extent being added. */
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ have_lt == 0 ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error);
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+ XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock,
+ out_error);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ /*
+ * Note: cursor currently points one record to the right of ltrec, even
+ * if there is no record in the tree to the right.
+ */
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ (ignore_off || ltrec.rm_offset + ltrec.rm_blockcount == offset)) {
+ /*
+ * left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ /*
+ * NOTE(review): ltrec.rm_blockcount already includes len here
+ * (added on the line above), so len looks like it is counted
+ * twice in the XFS_RMAP_LEN_MAX test below. That only makes
+ * the three-way merge more conservative; confirm whether it
+ * is intentional.
+ */
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset) &&
+ (unsigned long)ltrec.rm_blockcount + len +
+ gtrec.rm_blockcount <= XFS_RMAP_LEN_MAX) {
+ /*
+ * right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ gtrec.rm_startblock,
+ gtrec.rm_blockcount,
+ gtrec.rm_owner,
+ gtrec.rm_offset,
+ gtrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ }
+
+ /* point the cursor back to the left record and update */
+ error = xfs_btree_decrement(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset)) {
+ /*
+ * right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ if (!ignore_off)
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_update(cur, &gtrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * no contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ owner, offset, flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ /* Shared exit: the success path falls through here with error == 0. */
+ if (error)
+ trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Record a reverse mapping for the extent [bno, bno + len) of AG agno on
+ * behalf of the owner described by oinfo. The mapping is always added as
+ * a written (not unwritten) extent. Returns 0 without touching anything
+ * when the filesystem has no rmap btree; otherwise returns the result of
+ * xfs_rmap_map().
+ */
+int
+xfs_rmap_alloc(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_btree_cur	*rcur;
+	int			error;
+
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return 0;
+
+	rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+	error = xfs_rmap_map(rcur, bno, len, false, oinfo);
+
+	/* Delete the cursor in the mode that matches the outcome. */
+	xfs_btree_del_cursor(rcur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	return error;
+}
+
+/*
+ * State bits computed by xfs_rmap_convert() for the extent being converted:
+ * *_CONTIG  - the left/right neighbour can be merged with the new extent;
+ * *_FILLING - the conversion starts at the left edge / ends at the right
+ *             edge of the existing (PREV) record;
+ * *_VALID   - a left/right neighbour record exists at all. These two bits
+ *             are set by xfs_rmap_convert() but not tested in it.
+ */
+#define RMAP_LEFT_CONTIG (1 << 0)
+#define RMAP_RIGHT_CONTIG (1 << 1)
+#define RMAP_LEFT_FILLING (1 << 2)
+#define RMAP_RIGHT_FILLING (1 << 3)
+#define RMAP_LEFT_VALID (1 << 6)
+#define RMAP_RIGHT_VALID (1 << 7)
+
+/* Shorthand for the slots of the local r[] array in xfs_rmap_convert(). */
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+#define NEW r[3]
+
+/*
+ * Convert an unwritten extent to a real extent or vice versa.
+ * Does not handle overlapping extents.
+ *
+ * The range [bno, bno + len) at file offset oinfo->oi_offset must lie inside
+ * a single existing record (PREV). unwritten describes the state the range
+ * has *before* the conversion: the lookup uses XFS_RMAP_UNWRITTEN when it is
+ * true and the converted range ends up with the opposite setting.
+ *
+ * Depending on whether the range touches PREV's left and/or right edge
+ * (the FILLING bits) and whether the neighbouring records can absorb the
+ * converted range (the CONTIG bits), PREV is updated in place, shrunk, split
+ * into up to three records, or merged with its neighbours.
+ *
+ * Only inode-owned, data-fork, non-bmbt mappings are expected (see the
+ * ASSERT below). Returns 0 on success or an error from the btree helpers;
+ * failed consistency checks also leave through the done label.
+ */
+STATIC int
+xfs_rmap_convert(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ /* new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+ error = xfs_rmap_get_rec(cur, &PREV, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+
+ /* PREV must cover the whole range and currently be in the old state. */
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /*
+ * Decrement the cursor to see if we have a left-adjacent record to our
+ * insertion point. This will give us the record for start block
+ * contiguity tests.
+ */
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ error = xfs_rmap_get_rec(cur, &LEFT, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
+ done);
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, LEFT.rm_startblock,
+ LEFT.rm_blockcount, LEFT.rm_owner,
+ LEFT.rm_offset, LEFT.rm_flags);
+ if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
+ LEFT.rm_offset + LEFT.rm_blockcount == offset &&
+ xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ /* The first increment only steps back onto PREV, so it must succeed. */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
+ done);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (bno + len == RIGHT.rm_startblock &&
+ offset + len == RIGHT.rm_offset &&
+ xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ /*
+ * With both FILLING bits set len equals PREV.rm_blockcount, so this is
+ * the length of the fully merged record. Dropping RIGHT_CONTIG falls
+ * back to merging with the left neighbour only.
+ */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ _RET_IP_);
+
+ /* reset the cursor back to PREV */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ /* Delete RIGHT, then PREV, then grow LEFT to cover all three. */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW = PREV;
+ NEW.rm_blockcount = len + RIGHT.rm_blockcount;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /*
+ * NOTE(review): unlike the cases above, the stat value i of
+ * this cursor move is not checked; presumably safe because
+ * LEFT was already read successfully -- confirm.
+ */
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_startblock += len;
+ NEW.rm_offset += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /*
+ * NOTE(review): the stat value i of this cursor move is not
+ * checked; presumably safe because RIGHT was already read
+ * successfully -- confirm.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* Position the cursor for the insert; no such record may exist yet. */
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ NEW.rm_blockcount = offset - PREV.rm_offset;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ NEW.rm_startblock, NEW.rm_blockcount,
+ NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ /* new middle extent - newext */
+ /*
+ * NOTE(review): only the flags of cur->bc_rec.r are changed
+ * here; this presumably relies on xfs_rmap_lookup_eq() having
+ * left (bno, len, owner, offset, oldext) in cur->bc_rec.r --
+ * confirm against that helper.
+ */
+ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
+ cur->bc_rec.r.rm_flags |= newext;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+done:
+ /* Shared exit: the success path falls through here with error == 0. */
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+#undef NEW
+#undef LEFT
+#undef RIGHT
+#undef PREV
+
+/*
+ * Context handed through xfs_btree_query_range() to
+ * xfs_rmap_query_range_helper(): the caller's callback and its private data.
+ */
+struct xfs_rmap_query_range_info {
+ xfs_rmap_query_range_fn fn;
+ void *priv;
+};
+
+/*
+ * Range-query callback: convert the btree record into an rmap irec and hand
+ * it to the function stored in the query context. A conversion error is
+ * returned as-is and the caller's function is not invoked.
+ */
+STATIC int
+xfs_rmap_query_range_helper(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec,
+	void			*priv)
+{
+	struct xfs_rmap_query_range_info	*info = priv;
+	struct xfs_rmap_irec			rmap;
+	int					error;
+
+	error = xfs_rmap_btrec_to_irec(rec, &rmap);
+	if (!error)
+		error = info->fn(cur, &rmap, info->priv);
+	return error;
+}
+
+/*
+ * Call fn (with priv) for every rmap record between low_rec and high_rec.
+ * The two bounds are copied into btree key unions and the walk itself is
+ * delegated to xfs_btree_query_range(), whose return value is passed back.
+ */
+int
+xfs_rmap_query_range(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*low_rec,
+	struct xfs_rmap_irec		*high_rec,
+	xfs_rmap_query_range_fn		fn,
+	void				*priv)
+{
+	struct xfs_rmap_query_range_info	info = {
+		.fn	= fn,
+		.priv	= priv,
+	};
+	union xfs_btree_irec			lo_key;
+	union xfs_btree_irec			hi_key;
+
+	lo_key.r = *low_rec;
+	hi_key.r = *high_rec;
+
+	return xfs_btree_query_range(cur, &lo_key, &hi_key,
+			xfs_rmap_query_range_helper, &info);
+}
+
+/*
+ * Release the cursor left behind by xfs_rmap_finish_one(). A NULL cursor is
+ * a no-op. When error is non-zero the cursor is deleted in error mode and
+ * the AG buffer it referenced is released from the transaction as well.
+ */
+void
+xfs_rmap_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	struct xfs_buf		*agbp;
+
+	if (!rcur)
+		return;
+
+	/* Read the buffer pointer out before the cursor is deleted. */
+	agbp = rcur->bc_private.a.agbp;
+
+	if (!error) {
+		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+		return;
+	}
+
+	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+	xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred rmap operations. We pass back the
+ * btree cursor to maintain our lock on the rmapbt between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ *
+ * *pcur carries the cursor from one call to the next. If it belongs to a
+ * different AG than startblock it is cleaned up and replaced. MAP/ALLOC,
+ * UNMAP/FREE and CONVERT intents are dispatched to xfs_rmap_map(),
+ * xfs_rmap_unmap() and xfs_rmap_convert(); any other type asserts and
+ * returns -EFSCORRUPTED.
+ */
+int
+xfs_rmap_finish_one(
+	struct xfs_trans		*tp,
+	enum xfs_rmap_intent_type	type,
+	__uint64_t			owner,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state,
+	struct xfs_btree_cur		**pcur)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*rcur;
+	struct xfs_buf			*agbp = NULL;
+	struct xfs_owner_info		oinfo;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			bno;
+	bool				unwritten;
+	int				error;
+
+	agno = XFS_FSB_TO_AGNO(mp, startblock);
+	ASSERT(agno != NULLAGNUMBER);
+	bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+	trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork,
+			startoff, blockcount, state);
+
+	if (XFS_TEST_ERROR(false, mp,
+			XFS_ERRTAG_RMAP_FINISH_ONE,
+			XFS_RANDOM_RMAP_FINISH_ONE))
+		return -EIO;
+
+	/* A cursor cached for another AG cannot be reused; retire it. */
+	rcur = *pcur;
+	if (rcur && rcur->bc_private.a.agno != agno) {
+		xfs_rmap_finish_one_cleanup(tp, rcur, 0);
+		rcur = NULL;
+		*pcur = NULL;
+	}
+
+	if (!rcur) {
+		/*
+		 * Refresh the freelist before we start changing the
+		 * rmapbt, because a shape change could cause us to
+		 * allocate blocks.
+		 */
+		error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+		if (error)
+			return error;
+		if (!agbp)
+			return -EFSCORRUPTED;
+
+		rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+		if (!rcur) {
+			/* No cursor will own the AG buffer; let go of it. */
+			xfs_trans_brelse(tp, agbp);
+			return -ENOMEM;
+		}
+	}
+	*pcur = rcur;
+
+	xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff);
+	unwritten = (state == XFS_EXT_UNWRITTEN);
+	bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock);
+
+	switch (type) {
+	case XFS_RMAP_ALLOC:
+	case XFS_RMAP_MAP:
+		return xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
+	case XFS_RMAP_FREE:
+	case XFS_RMAP_UNMAP:
+		return xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
+				&oinfo);
+	case XFS_RMAP_CONVERT:
+		/* state is the target state; convert wants the current one. */
+		return xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
+				&oinfo);
+	default:
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+}
+
+/*
+ * Don't defer an rmap if we aren't an rmap filesystem.
+ *
+ * Returns the result of xfs_sb_version_hasrmapbt() for this mount's
+ * superblock; the xfs_rmap_*_extent() helpers return 0 early when it is
+ * false.
+ */
+static bool
+xfs_rmap_update_is_needed(
+ struct xfs_mount *mp)
+{
+ return xfs_sb_version_hasrmapbt(&mp->m_sb);
+}
+
+/*
+ * Record a rmap intent; the list is kept sorted first by AG and then by
+ * increasing age.
+ *
+ * Allocates an xfs_rmap_intent, fills it from the arguments (the bmap
+ * record is copied by value) and queues it on dfops as an
+ * XFS_DEFER_OPS_TYPE_RMAP item. Always returns 0.
+ */
+static int
+__xfs_rmap_add(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	enum xfs_rmap_intent_type	type,
+	__uint64_t			owner,
+	int				whichfork,
+	struct xfs_bmbt_irec		*bmap)
+{
+	struct xfs_rmap_intent		*intent;
+	xfs_fsblock_t			fsbno = bmap->br_startblock;
+
+	trace_xfs_rmap_defer(mp, XFS_FSB_TO_AGNO(mp, fsbno), type,
+			XFS_FSB_TO_AGBNO(mp, fsbno), owner, whichfork,
+			bmap->br_startoff, bmap->br_blockcount,
+			bmap->br_state);
+
+	intent = kmem_alloc(sizeof(*intent), KM_SLEEP | KM_NOFS);
+	INIT_LIST_HEAD(&intent->ri_list);
+	intent->ri_bmap = *bmap;
+	intent->ri_whichfork = whichfork;
+	intent->ri_owner = owner;
+	intent->ri_type = type;
+
+	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_RMAP, &intent->ri_list);
+	return 0;
+}
+
+/*
+ * Map an extent into a file.
+ *
+ * Queues an XFS_RMAP_MAP intent for the mapping imap of inode ip's
+ * whichfork fork. Returns 0 immediately when the filesystem does not need
+ * rmap updates.
+ */
+int
+xfs_rmap_map_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*imap)
+{
+	if (!xfs_rmap_update_is_needed(mp))
+		return 0;
+
+	return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino,
+			whichfork, imap);
+}
+
+/*
+ * Unmap an extent out of a file.
+ *
+ * Queues an XFS_RMAP_UNMAP intent for the mapping imap of inode ip's
+ * whichfork fork. Returns 0 immediately when the filesystem does not need
+ * rmap updates.
+ */
+int
+xfs_rmap_unmap_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*imap)
+{
+	if (!xfs_rmap_update_is_needed(mp))
+		return 0;
+
+	return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino,
+			whichfork, imap);
+}
+
+/*
+ * Convert a data fork extent from unwritten to real or vice versa.
+ *
+ * Queues an XFS_RMAP_CONVERT intent for the mapping imap of inode ip.
+ * Returns 0 immediately when the filesystem does not need rmap updates.
+ */
+int
+xfs_rmap_convert_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*imap)
+{
+	if (!xfs_rmap_update_is_needed(mp))
+		return 0;
+
+	return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino,
+			whichfork, imap);
+}
+
+/*
+ * Schedule the creation of an rmap for non-file data.
+ *
+ * The AG-relative extent is wrapped in a temporary bmap record (file
+ * offset 0, state XFS_EXT_NORM) and queued as an XFS_RMAP_ALLOC intent for
+ * the data fork. Returns 0 immediately when rmap updates are not needed.
+ */
+int
+xfs_rmap_alloc_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	__uint64_t		owner)
+{
+	struct xfs_bmbt_irec	irec;
+
+	if (!xfs_rmap_update_is_needed(mp))
+		return 0;
+
+	irec.br_startoff = 0;
+	irec.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
+	irec.br_blockcount = len;
+	irec.br_state = XFS_EXT_NORM;
+
+	return __xfs_rmap_add(mp, dfops, XFS_RMAP_ALLOC, owner,
+			XFS_DATA_FORK, &irec);
+}
+
+/*
+ * Schedule the deletion of an rmap for non-file data.
+ *
+ * The AG-relative extent is wrapped in a temporary bmap record (file
+ * offset 0, state XFS_EXT_NORM) and queued as an XFS_RMAP_FREE intent for
+ * the data fork. Returns 0 immediately when rmap updates are not needed.
+ */
+int
+xfs_rmap_free_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	__uint64_t		owner)
+{
+	struct xfs_bmbt_irec	irec;
+
+	if (!xfs_rmap_update_is_needed(mp))
+		return 0;
+
+	irec.br_startoff = 0;
+	irec.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
+	irec.br_blockcount = len;
+	irec.br_state = XFS_EXT_NORM;
+
+	return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner,
+			XFS_DATA_FORK, &irec);
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
new file mode 100644
index 0000000000000..71cf99a4acbae
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_RMAP_H__
+#define __XFS_RMAP_H__
+
+/* Fill oi for an owner that has neither a file offset nor fork flags. */
+static inline void
+xfs_rmap_ag_owner(
+	struct xfs_owner_info	*oi,
+	uint64_t		owner)
+{
+	oi->oi_flags = 0;
+	oi->oi_offset = 0;
+	oi->oi_owner = owner;
+}
+
+/*
+ * Fill oi for a bmap btree block of inode ino: offset 0, the BMBT_BLOCK
+ * flag set, plus ATTR_FORK when whichfork is the attribute fork.
+ */
+static inline void
+xfs_rmap_ino_bmbt_owner(
+	struct xfs_owner_info	*oi,
+	xfs_ino_t		ino,
+	int			whichfork)
+{
+	unsigned int		oi_flags = XFS_OWNER_INFO_BMBT_BLOCK;
+
+	if (whichfork == XFS_ATTR_FORK)
+		oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+
+	oi->oi_owner = ino;
+	oi->oi_offset = 0;
+	oi->oi_flags = oi_flags;
+}
+
+/*
+ * Fill oi for file blocks of inode ino at file offset offset; ATTR_FORK is
+ * the only flag that may be set, and only for the attribute fork.
+ */
+static inline void
+xfs_rmap_ino_owner(
+	struct xfs_owner_info	*oi,
+	xfs_ino_t		ino,
+	int			whichfork,
+	xfs_fileoff_t		offset)
+{
+	oi->oi_owner = ino;
+	oi->oi_offset = offset;
+	oi->oi_flags = (whichfork == XFS_ATTR_FORK) ?
+			XFS_OWNER_INFO_ATTR_FORK : 0;
+}
+
+/*
+ * Mark oi with the XFS_RMAP_OWN_UNKNOWN owner. Only oi_owner is written;
+ * oi_offset and oi_flags keep whatever the caller left in them.
+ * NOTE(review): presumably consumers treat this owner value as "skip the
+ * rmap update" and never read the other two fields -- confirm.
+ */
+static inline void
+xfs_rmap_skip_owner_update(
+ struct xfs_owner_info *oi)
+{
+ oi->oi_owner = XFS_RMAP_OWN_UNKNOWN;
+}
+
+/* Reverse mapping functions. */
+
+struct xfs_buf;
+
+/*
+ * Build the packed offset word for irec: XFS_RMAP_OFF() of the file offset
+ * with one XFS_RMAP_OFF_* bit ORed in for each of the ATTR_FORK, BMBT_BLOCK
+ * and UNWRITTEN flags set in irec->rm_flags.
+ */
+static inline __u64
+xfs_rmap_irec_offset_pack(
+	const struct xfs_rmap_irec	*irec)
+{
+	const unsigned int		rm_flags = irec->rm_flags;
+	__u64				packed = XFS_RMAP_OFF(irec->rm_offset);
+
+	if (rm_flags & XFS_RMAP_UNWRITTEN)
+		packed |= XFS_RMAP_OFF_UNWRITTEN;
+	if (rm_flags & XFS_RMAP_BMBT_BLOCK)
+		packed |= XFS_RMAP_OFF_BMBT_BLOCK;
+	if (rm_flags & XFS_RMAP_ATTR_FORK)
+		packed |= XFS_RMAP_OFF_ATTR_FORK;
+
+	return packed;
+}
+
+/*
+ * Split a packed offset word into irec->rm_offset and irec->rm_flags.
+ *
+ * Returns -EFSCORRUPTED, leaving irec untouched, when offset has bits set
+ * outside XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS; returns 0 otherwise.
+ *
+ * rm_flags is cleared before the flag bits are decoded so that the result
+ * depends only on offset and not on whatever the caller left in irec
+ * (records are commonly decoded into uninitialised stack variables).
+ */
+static inline int
+xfs_rmap_irec_offset_unpack(
+	__u64			offset,
+	struct xfs_rmap_irec	*irec)
+{
+	if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
+		return -EFSCORRUPTED;
+
+	irec->rm_offset = XFS_RMAP_OFF(offset);
+	irec->rm_flags = 0;
+	if (offset & XFS_RMAP_OFF_ATTR_FORK)
+		irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+	if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
+		irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+	if (offset & XFS_RMAP_OFF_UNWRITTEN)
+		irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+	return 0;
+}
+
+/*
+ * Copy owner and offset out of oinfo and translate its XFS_OWNER_INFO_*
+ * fork/bmbt flags into the matching XFS_RMAP_* flags. *flags is
+ * overwritten, not ORed into.
+ */
+static inline void
+xfs_owner_info_unpack(
+	struct xfs_owner_info	*oinfo,
+	uint64_t		*owner,
+	uint64_t		*offset,
+	unsigned int		*flags)
+{
+	unsigned int		rmap_flags;
+
+	*owner = oinfo->oi_owner;
+	*offset = oinfo->oi_offset;
+
+	rmap_flags = (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) ?
+			XFS_RMAP_ATTR_FORK : 0;
+	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+		rmap_flags |= XFS_RMAP_BMBT_BLOCK;
+
+	*flags = rmap_flags;
+}
+
+/*
+ * Fill oinfo from an owner, a file offset (passed through XFS_RMAP_OFF())
+ * and XFS_RMAP_* flags; only the ATTR_FORK and BMBT_BLOCK flags are
+ * translated, any other flag bit is dropped.
+ */
+static inline void
+xfs_owner_info_pack(
+	struct xfs_owner_info	*oinfo,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags)
+{
+	unsigned int		oi_flags = 0;
+
+	if (flags & XFS_RMAP_ATTR_FORK)
+		oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+	if (flags & XFS_RMAP_BMBT_BLOCK)
+		oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+	oinfo->oi_owner = owner;
+	oinfo->oi_offset = XFS_RMAP_OFF(offset);
+	oinfo->oi_flags = oi_flags;
+}
+
+int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_owner_info *oinfo);
+int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_owner_info *oinfo);
+
+int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags, int *stat);
+int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags, int *stat);
+int xfs_rmap_insert(struct xfs_btree_cur *rcur, xfs_agblock_t agbno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags);
+int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec,
+ int *stat);
+
+typedef int (*xfs_rmap_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv);
+
+int xfs_rmap_query_range(struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn, void *priv);
+
+/*
+ * Kind of deferred rmap update, stored in xfs_rmap_intent.ri_type.
+ * xfs_rmap_finish_one() handles MAP and ALLOC through xfs_rmap_map(),
+ * UNMAP and FREE through xfs_rmap_unmap(), and CONVERT through
+ * xfs_rmap_convert(); the *_SHARED values fall into its default case,
+ * which asserts and returns -EFSCORRUPTED.
+ */
+enum xfs_rmap_intent_type {
+ XFS_RMAP_MAP,
+ XFS_RMAP_MAP_SHARED,
+ XFS_RMAP_UNMAP,
+ XFS_RMAP_UNMAP_SHARED,
+ XFS_RMAP_CONVERT,
+ XFS_RMAP_CONVERT_SHARED,
+ XFS_RMAP_ALLOC,
+ XFS_RMAP_FREE,
+};
+
+/*
+ * One queued rmap update. __xfs_rmap_add() allocates and fills it, then
+ * hands ri_list to xfs_defer_add().
+ */
+struct xfs_rmap_intent {
+ struct list_head ri_list; /* linkage passed to xfs_defer_add() */
+ enum xfs_rmap_intent_type ri_type; /* which update to perform */
+ __uint64_t ri_owner; /* inode number or other owner value */
+ int ri_whichfork; /* fork the mapping belongs to */
+ struct xfs_bmbt_irec ri_bmap; /* by-value copy of the mapping */
+};
+
+/* functions for updating the rmapbt based on bmbt map/unmap operations */
+int xfs_rmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+int xfs_rmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ __uint64_t owner);
+int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ __uint64_t owner);
+
+void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
+ __uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock, xfs_filblks_t blockcount,
+ xfs_exntst_t state, struct xfs_btree_cur **pcur);
+
+#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
new file mode 100644
index 0000000000000..bc1faebc84ecc
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+
+/*
+ * Reverse map btree.
+ *
+ * This is a per-ag tree used to track the owner(s) of a given extent. With
+ * reflink it is possible for there to be multiple owners, which is a departure
+ * from classic XFS. Owner records for data extents are inserted when the
+ * extent is mapped and removed when an extent is unmapped. Owner records for
+ * all other block types (i.e. metadata) are inserted when an extent is
+ * allocated and removed when an extent is freed. There can only be one owner
+ * of a metadata extent, usually an inode or some other metadata structure like
+ * an AG btree.
+ *
+ * The rmap btree is part of the free space management, so blocks for the tree
+ * are sourced from the agfl. Hence we need transaction reservation support for
+ * this tree so that the freelist is always large enough. This also impacts on
+ * the minimum space we need to leave free in the AG.
+ *
+ * The tree is ordered by [ag block, owner, offset]. This is a large key size,
+ * but it is the only way to enforce unique keys when a block can be owned by
+ * multiple files at any offset. There's no need to order/search by extent
+ * size for online updating/management of the tree. It is intended that most
+ * reverse lookups will be to find the owner(s) of a particular block, or to
+ * try to recover tree and file data from corrupt primary metadata.
+ */
+
+static struct xfs_btree_cur *
+xfs_rmapbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno);
+}
+
+STATIC void
+xfs_rmapbt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ int btnum = cur->bc_btnum;
+ struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ pag->pagf_levels[btnum] += inc;
+ xfs_perag_put(pag);
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_rmapbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ int error;
+ xfs_agblock_t bno;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ &bno, 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ bno, 1);
+ if (bno == NULLAGBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
+ false);
+
+ xfs_trans_agbtree_delta(cur->bc_tp, 1);
+ new->s = cpu_to_be32(bno);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+ trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ bno, 1);
+ error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ if (error)
+ return error;
+
+ xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+ xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rmapbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mxr[level != 0];
+}
+
+STATIC void
+xfs_rmapbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+}
+
+/*
+ * The high key for a reverse mapping record can be computed by shifting
+ * the startblock and offset to the highest value that would still map
+ * to that record. In practice this means that we add blockcount-1 to
+ * the startblock for all records, and if the record is for a data/attr
+ * fork mapping, we add blockcount-1 to the offset too.
+ */
+STATIC void
+xfs_rmapbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __uint64_t off;
+ int adj;
+
+ adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ be32_add_cpu(&key->rmap.rm_startblock, adj);
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+ if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+ XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+ return;
+ off = be64_to_cpu(key->rmap.rm_offset);
+ off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+ key->rmap.rm_offset = cpu_to_be64(off);
+}
+
+STATIC void
+xfs_rmapbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+ rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+ rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+ rec->rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rmapbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+ ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_rmapbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ __int64_t d;
+
+ d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp->rm_owner);
+ y = rec->rm_owner;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
+ y = rec->rm_offset;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+STATIC __int64_t
+xfs_rmapbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ struct xfs_rmap_key *kp1 = &k1->rmap;
+ struct xfs_rmap_key *kp2 = &k2->rmap;
+ __int64_t d;
+ __u64 x, y;
+
+ d = (__int64_t)be32_to_cpu(kp1->rm_startblock) -
+ be32_to_cpu(kp2->rm_startblock);
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp1->rm_owner);
+ y = be64_to_cpu(kp2->rm_owner);
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
+ y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+static bool
+xfs_rmapbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level or owner as
+ * the perag is not fully initialised and hence not attached to the
+ * buffer. In this case, check against the maximum tree depth.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agf information will not yet have been initialised
+ * from the on disk AGF. Again, we can only check against maximum limits
+ * in this case.
+ */
+ if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ return false;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return false;
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
+ return false;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+ return false;
+ } else if (level >= mp->m_rmap_maxlevels)
+ return false;
+
+ return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+}
+
+static void
+xfs_rmapbt_read_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, -EFSBADCRC);
+ else if (!xfs_rmapbt_verify(bp))
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
+}
+
+static void
+xfs_rmapbt_write_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_rmapbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
+ .name = "xfs_rmapbt",
+ .verify_read = xfs_rmapbt_read_verify,
+ .verify_write = xfs_rmapbt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_rmapbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ __uint32_t x;
+ __uint32_t y;
+ __uint64_t a;
+ __uint64_t b;
+
+ x = be32_to_cpu(k1->rmap.rm_startblock);
+ y = be32_to_cpu(k2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(k1->rmap.rm_owner);
+ b = be64_to_cpu(k2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ __uint32_t x;
+ __uint32_t y;
+ __uint64_t a;
+ __uint64_t b;
+
+ x = be32_to_cpu(r1->rmap.rm_startblock);
+ y = be32_to_cpu(r2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(r1->rmap.rm_owner);
+ b = be64_to_cpu(r2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+#endif /* DEBUG || XFS_WARN */
+
+static const struct xfs_btree_ops xfs_rmapbt_ops = {
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+
+ .dup_cursor = xfs_rmapbt_dup_cursor,
+ .set_root = xfs_rmapbt_set_root,
+ .alloc_block = xfs_rmapbt_alloc_block,
+ .free_block = xfs_rmapbt_free_block,
+ .get_minrecs = xfs_rmapbt_get_minrecs,
+ .get_maxrecs = xfs_rmapbt_get_maxrecs,
+ .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
+ .key_diff = xfs_rmapbt_key_diff,
+ .buf_ops = &xfs_rmapbt_buf_ops,
+ .diff_two_keys = xfs_rmapbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_rmapbt_keys_inorder,
+ .recs_inorder = xfs_rmapbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new reverse mapping btree cursor.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ /* Overlapping btree; 2 keys per pointer. */
+ cur->bc_btnum = XFS_BTNUM_RMAP;
+ cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_ops = &xfs_rmapbt_ops;
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+
+ return cur;
+}
+
+/*
+ * Calculate number of records in an rmap btree block.
+ */
+int
+xfs_rmapbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_RMAP_BLOCK_LEN;
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
+/* Compute the maximum height of an rmap btree. */
+void
+xfs_rmapbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
new file mode 100644
index 0000000000000..e73a55357dabe
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_RMAP_BTREE_H__
+#define __XFS_RMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_RMAP_REC_ADDR(block, index) \
+ ((struct xfs_rmap_rec *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_rmap_rec))))
+
+#define XFS_RMAP_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_HIGH_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_rmap_ptr_t *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (maxrecs) * 2 * sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * sizeof(xfs_rmap_ptr_t)))
+
+struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ xfs_agnumber_t agno);
+int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
+extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
+
+#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 951c044e24e40..e2e1106c9fadc 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
-int
+static int
xfs_rtbuf_get(
xfs_mount_t *mp, /* file system mount structure */
xfs_trans_t *tp, /* transaction pointer */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 8a53eaa349f44..0e3d4f5ec33c6 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
@@ -36,6 +37,7 @@
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_log.h"
+#include "xfs_rmap_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -729,6 +731,11 @@ xfs_sb_mount_common(
mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+ mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
+ mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
@@ -738,6 +745,8 @@ xfs_sb_mount_common(
mp->m_ialloc_min_blks = sbp->sb_spino_align;
else
mp->m_ialloc_min_blks = mp->m_ialloc_blks;
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
}
/*
@@ -838,12 +847,10 @@ xfs_sync_sb(
struct xfs_trans *tp;
int error;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
+ XFS_TRANS_NO_WRITECOUNT, &tp);
+ if (error)
return error;
- }
xfs_log_sb(tp);
if (wait)
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 81ac870834da9..0c5b30bd884cd 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
@@ -56,103 +57,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
/*
- * Transaction types. Used to distinguish types of buffers. These never reach
- * the log.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE 1
-#define XFS_TRANS_SETATTR_SIZE 2
-#define XFS_TRANS_INACTIVE 3
-#define XFS_TRANS_CREATE 4
-#define XFS_TRANS_CREATE_TRUNC 5
-#define XFS_TRANS_TRUNCATE_FILE 6
-#define XFS_TRANS_REMOVE 7
-#define XFS_TRANS_LINK 8
-#define XFS_TRANS_RENAME 9
-#define XFS_TRANS_MKDIR 10
-#define XFS_TRANS_RMDIR 11
-#define XFS_TRANS_SYMLINK 12
-#define XFS_TRANS_SET_DMATTRS 13
-#define XFS_TRANS_GROWFS 14
-#define XFS_TRANS_STRAT_WRITE 15
-#define XFS_TRANS_DIOSTRAT 16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define XFS_TRANS_WRITEID 18
-#define XFS_TRANS_ADDAFORK 19
-#define XFS_TRANS_ATTRINVAL 20
-#define XFS_TRANS_ATRUNCATE 21
-#define XFS_TRANS_ATTR_SET 22
-#define XFS_TRANS_ATTR_RM 23
-#define XFS_TRANS_ATTR_FLAG 24
-#define XFS_TRANS_CLEAR_AGI_BUCKET 25
-#define XFS_TRANS_SB_CHANGE 26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1 27
-#define XFS_TRANS_DUMMY2 28
-#define XFS_TRANS_QM_QUOTAOFF 29
-#define XFS_TRANS_QM_DQALLOC 30
-#define XFS_TRANS_QM_SETQLIM 31
-#define XFS_TRANS_QM_DQCLUSTER 32
-#define XFS_TRANS_QM_QINOCREATE 33
-#define XFS_TRANS_QM_QUOTAOFF_END 34
-#define XFS_TRANS_FSYNC_TS 35
-#define XFS_TRANS_GROWFSRT_ALLOC 36
-#define XFS_TRANS_GROWFSRT_ZERO 37
-#define XFS_TRANS_GROWFSRT_FREE 38
-#define XFS_TRANS_SWAPEXT 39
-#define XFS_TRANS_CHECKPOINT 40
-#define XFS_TRANS_ICREATE 41
-#define XFS_TRANS_CREATE_TMPFILE 42
-#define XFS_TRANS_TYPE_MAX 43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
- { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
- { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
- { XFS_TRANS_INACTIVE, "INACTIVE" }, \
- { XFS_TRANS_CREATE, "CREATE" }, \
- { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
- { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
- { XFS_TRANS_REMOVE, "REMOVE" }, \
- { XFS_TRANS_LINK, "LINK" }, \
- { XFS_TRANS_RENAME, "RENAME" }, \
- { XFS_TRANS_MKDIR, "MKDIR" }, \
- { XFS_TRANS_RMDIR, "RMDIR" }, \
- { XFS_TRANS_SYMLINK, "SYMLINK" }, \
- { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
- { XFS_TRANS_GROWFS, "GROWFS" }, \
- { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
- { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
- { XFS_TRANS_WRITEID, "WRITEID" }, \
- { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
- { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
- { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
- { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
- { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
- { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
- { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
- { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \
- { XFS_TRANS_DUMMY1, "DUMMY1" }, \
- { XFS_TRANS_DUMMY2, "DUMMY2" }, \
- { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
- { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
- { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
- { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
- { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
- { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
- { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
- { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
- { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
- { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
- { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
- { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
- { XFS_TRANS_ICREATE, "ICREATE" }, \
- { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
- { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
-
-/*
* This structure is used to track log items associated with
* a transaction. It points to the log item and keeps some
* flags to track the state of the log item. It also tracks
@@ -181,8 +85,9 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
- count in superblock */
+#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
+#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */
+
/*
* Field values for xfs_trans_mod_sb.
*/
@@ -212,6 +117,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_INO_BTREE_REF 3
#define XFS_ALLOC_BTREE_REF 2
#define XFS_BMAP_BTREE_REF 2
+#define XFS_RMAP_BTREE_REF 2
#define XFS_DIR_BTREE_REF 2
#define XFS_INO_REF 2
#define XFS_ATTR_BTREE_REF 1
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 68cb1e7bf2bb1..301ef2f4dbd62 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -64,6 +64,30 @@ xfs_calc_buf_res(
}
/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating an extent. In classic XFS there are two trees that will be
+ * modified (bnobt + cntbt). With rmap enabled, a third tree (rmapbt) is
+ * modified as well. The number of blocks reserved is based on the formula:
+ *
+ * num trees * ((2 blocks/level * max depth) - 1)
+ *
+ * Keep in mind that max depth is calculated separately for each type of tree.
+ */
+static uint
+xfs_allocfree_log_count(
+ struct xfs_mount *mp,
+ uint num_ops)
+{
+ uint blocks;
+
+ blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+
+ return blocks;
+}
+
+/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
* the amount of space they use on disk.
@@ -126,7 +150,7 @@ xfs_calc_inode_res(
*/
STATIC uint
xfs_calc_finobt_res(
- struct xfs_mount *mp,
+ struct xfs_mount *mp,
int alloc,
int modify)
{
@@ -137,7 +161,7 @@ xfs_calc_finobt_res(
res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
if (alloc)
- res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ res += xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
if (modify)
res += (uint)XFS_FSB_TO_B(mp, 1);
@@ -153,9 +177,9 @@ xfs_calc_finobt_res(
* item logged to try to account for the overhead of the transaction mechanism.
*
* Note: Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
+ * groups into which they could free extents in the xfs_defer_finish() call.
* This is because the number in the worst case is quite high and quite
- * unusual. In order to fix this we need to change xfs_bmap_finish() to free
+ * unusual. In order to fix this we need to change xfs_defer_finish() to free
* extents in only a single AG at a time. This will require changes to the
* EFI code as well, however, so that the EFI for the extents not freed is
* logged again in each transaction. See SGI PV #261917.
@@ -188,10 +212,10 @@ xfs_calc_write_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -217,10 +241,10 @@ xfs_calc_itruncate_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(5, 0) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(2 + mp->m_ialloc_blks +
mp->m_in_maxlevels, 0)));
@@ -247,7 +271,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
XFS_FSB_TO_B(mp, 1))));
}
@@ -286,7 +310,7 @@ xfs_calc_link_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1))));
}
@@ -324,7 +348,7 @@ xfs_calc_remove_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -371,7 +395,7 @@ xfs_calc_create_resv_alloc(
mp->m_sb.sb_sectsize +
xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -399,7 +423,7 @@ xfs_calc_icreate_resv_alloc(
return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
mp->m_sb.sb_sectsize +
xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_finobt_res(mp, 0, 0);
}
@@ -483,7 +507,7 @@ xfs_calc_ifree_reservation(
xfs_calc_buf_res(1, 0) +
xfs_calc_buf_res(2 + mp->m_ialloc_blks +
mp->m_in_maxlevels, 0) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_finobt_res(mp, 0, 1);
}
@@ -513,7 +537,7 @@ xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -535,7 +559,7 @@ xfs_calc_growrtalloc_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -611,7 +635,7 @@ xfs_calc_addafork_reservation(
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -634,7 +658,7 @@ xfs_calc_attrinval_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
XFS_FSB_TO_B(mp, 1))));
}
@@ -701,7 +725,7 @@ xfs_calc_attrrm_reservation(
XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 797815012c0e3..0eb46ed6d404d 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -68,16 +68,6 @@ struct xfs_trans_resv {
#define M_RES(mp) (&(mp)->m_resv)
/*
- * Per-extent log reservation for the allocation btree changes
- * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
- */
-#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
-#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
- ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
-
-/*
* Per-directory log reservation for any directory change.
* dir blocks: (1 btree block per level + data block + free block) * dblock size
* bmap btree: (levels + 2) * max depth * block size
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b79dc66b2ecd4..3d503647f26b6 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -108,8 +108,8 @@ typedef enum {
} xfs_lookup_t;
typedef enum {
- XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
- XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+ XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
+ XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
} xfs_btnum_t;
struct xfs_name {
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 2d5df1f23bbcb..b6e527b8eccb6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -158,22 +158,14 @@ xfs_get_acl(struct inode *inode, int type)
if (error) {
/*
* If the attribute doesn't exist make sure we have a negative
- * cache entry, for any other error assume it is transient and
- * leave the cache entry as ACL_NOT_CACHED.
+ * cache entry, for any other error assume it is transient.
*/
- if (error == -ENOATTR)
- goto out_update_cache;
- acl = ERR_PTR(error);
- goto out;
+ if (error != -ENOATTR)
+ acl = ERR_PTR(error);
+ } else {
+ acl = xfs_acl_from_disk(xfs_acl, len,
+ XFS_ACL_MAX_ENTRIES(ip->i_mount));
}
-
- acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
- if (IS_ERR(acl))
- goto out;
-
-out_update_cache:
- set_cached_acl(inode, type, acl);
-out:
kmem_free(xfs_acl);
return acl;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d445a64b979e9..7575cfc3ad156 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -84,23 +84,80 @@ xfs_find_bdev_for_inode(
}
/*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory. Do not use the ioend after this.
+ * We're now finished for good with this page. Update the page state via the
+ * associated buffer_heads, paying attention to the start and end offsets that
+ * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
+ */
+static void
+xfs_finish_page_writeback(
+ struct inode *inode,
+ struct bio_vec *bvec,
+ int error)
+{
+ unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
+ struct buffer_head *head, *bh, *next;
+ unsigned int off = 0;
+ unsigned int bsize;
+
+ ASSERT(bvec->bv_offset < PAGE_SIZE);
+ ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
+ ASSERT(end < PAGE_SIZE);
+ ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
+
+ bh = head = page_buffers(bvec->bv_page);
+
+ bsize = bh->b_size;
+ do {
+ next = bh->b_this_page;
+ if (off < bvec->bv_offset)
+ goto next_bh;
+ if (off > end)
+ break;
+ bh->b_end_io(bh, !error);
+next_bh:
+ off += bsize;
+ } while ((bh = next) != head);
+}
+
+/*
+ * We're now finished for good with this ioend structure. Update the page
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
*/
STATIC void
xfs_destroy_ioend(
- xfs_ioend_t *ioend)
+ struct xfs_ioend *ioend,
+ int error)
{
- struct buffer_head *bh, *next;
+ struct inode *inode = ioend->io_inode;
+ struct bio *last = ioend->io_bio;
+ struct bio *bio, *next;
- for (bh = ioend->io_buffer_head; bh; bh = next) {
- next = bh->b_private;
- bh->b_end_io(bh, !ioend->io_error);
- }
+ for (bio = &ioend->io_inline_bio; bio; bio = next) {
+ struct bio_vec *bvec;
+ int i;
- mempool_free(ioend, xfs_ioend_pool);
+ /*
+ * For the last bio, bi_private points to the ioend, so we
+ * need to explicitly end the iteration here.
+ */
+ if (bio == last)
+ next = NULL;
+ else
+ next = bio->bi_private;
+
+ /* walk each page on bio, ending page IO on them */
+ bio_for_each_segment_all(bvec, bio, i)
+ xfs_finish_page_writeback(inode, bvec, error);
+
+ bio_put(bio);
+ }
}
/*
@@ -120,13 +177,9 @@ xfs_setfilesize_trans_alloc(
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
return error;
- }
ioend->io_append_trans = tp;
@@ -174,7 +227,8 @@ xfs_setfilesize(
STATIC int
xfs_setfilesize_ioend(
- struct xfs_ioend *ioend)
+ struct xfs_ioend *ioend,
+ int error)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_trans *tp = ioend->io_append_trans;
@@ -188,53 +242,32 @@ xfs_setfilesize_ioend(
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
- if (ioend->io_error) {
+ if (error) {
xfs_trans_cancel(tp);
- return ioend->io_error;
+ return error;
}
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
/*
- * Schedule IO completion handling on the final put of an ioend.
- *
- * If there is no work to do we might as well call it a day and free the
- * ioend right now.
- */
-STATIC void
-xfs_finish_ioend(
- struct xfs_ioend *ioend)
-{
- if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
-
- if (ioend->io_type == XFS_IO_UNWRITTEN)
- queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
- queue_work(mp->m_data_workqueue, &ioend->io_work);
- else
- xfs_destroy_ioend(ioend);
- }
-}
-
-/*
* IO write completion.
*/
STATIC void
xfs_end_io(
struct work_struct *work)
{
- xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- int error = 0;
+ struct xfs_ioend *ioend =
+ container_of(work, struct xfs_ioend, io_work);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ int error = ioend->io_bio->bi_error;
/*
* Set an error if the mount has shut down and proceed with end I/O
* processing so it can perform whatever cleanups are necessary.
*/
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- ioend->io_error = -EIO;
+ error = -EIO;
/*
* For unwritten extents we need to issue transactions to convert a
@@ -244,55 +277,33 @@ xfs_end_io(
* on error.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
- if (ioend->io_error)
+ if (error)
goto done;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
} else if (ioend->io_append_trans) {
- error = xfs_setfilesize_ioend(ioend);
+ error = xfs_setfilesize_ioend(ioend, error);
} else {
ASSERT(!xfs_ioend_is_append(ioend));
}
done:
- if (error)
- ioend->io_error = error;
- xfs_destroy_ioend(ioend);
+ xfs_destroy_ioend(ioend, error);
}
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
- struct inode *inode,
- unsigned int type)
+STATIC void
+xfs_end_bio(
+ struct bio *bio)
{
- xfs_ioend_t *ioend;
-
- ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-
- /*
- * Set the count to 1 initially, which will prevent an I/O
- * completion callback from happening before we have started
- * all the I/O from calling the completion routine too early.
- */
- atomic_set(&ioend->io_remaining, 1);
- ioend->io_error = 0;
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
- ioend->io_inode = inode;
- ioend->io_buffer_head = NULL;
- ioend->io_buffer_tail = NULL;
- ioend->io_offset = 0;
- ioend->io_size = 0;
- ioend->io_append_trans = NULL;
+ struct xfs_ioend *ioend = bio->bi_private;
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- INIT_WORK(&ioend->io_work, xfs_end_io);
- return ioend;
+ if (ioend->io_type == XFS_IO_UNWRITTEN)
+ queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+ else if (ioend->io_append_trans)
+ queue_work(mp->m_data_workqueue, &ioend->io_work);
+ else
+ xfs_destroy_ioend(ioend, bio->bi_error);
}
STATIC int
@@ -364,50 +375,6 @@ xfs_imap_valid(
offset < imap->br_startoff + imap->br_blockcount;
}
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC void
-xfs_end_bio(
- struct bio *bio)
-{
- xfs_ioend_t *ioend = bio->bi_private;
-
- if (!ioend->io_error)
- ioend->io_error = bio->bi_error;
-
- /* Toss bio and pass work off to an xfsdatad thread */
- bio->bi_private = NULL;
- bio->bi_end_io = NULL;
- bio_put(bio);
-
- xfs_finish_ioend(ioend);
-}
-
-STATIC void
-xfs_submit_ioend_bio(
- struct writeback_control *wbc,
- xfs_ioend_t *ioend,
- struct bio *bio)
-{
- atomic_inc(&ioend->io_remaining);
- bio->bi_private = ioend;
- bio->bi_end_io = xfs_end_bio;
- submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
- struct buffer_head *bh)
-{
- struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
-
- ASSERT(bio->bi_private == NULL);
- bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
- return bio;
-}
-
STATIC void
xfs_start_buffer_writeback(
struct buffer_head *bh)
@@ -452,28 +419,36 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
}
/*
- * Submit all of the bios for an ioend. We are only passed a single ioend at a
- * time; the caller is responsible for chaining prior to submission.
+ * Submit the bio for an ioend. We are passed an ioend with a bio attached to
+ * it, and we submit that bio. The ioend may be used for multiple bio
+ * submissions, so we only want to allocate an append transaction for the ioend
+ * once. In the case of multiple bio submission, each bio will take an IO
+ * reference to the ioend to ensure that the ioend completion is only done once
+ * all bios have been submitted and the ioend is really done.
*
* If @status is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked pages for writeback
- * and unlocked them. In this situation, we need to fail the ioend chain rather
- * than submit it to IO. This typically only happens on a filesystem shutdown.
+ * and unlocked them. In this situation, we need to fail the bio and ioend
+ * rather than submit it to IO. This typically only happens on a filesystem
+ * shutdown.
*/
STATIC int
xfs_submit_ioend(
struct writeback_control *wbc,
- xfs_ioend_t *ioend,
+ struct xfs_ioend *ioend,
int status)
{
- struct buffer_head *bh;
- struct bio *bio;
- sector_t lastblock = 0;
-
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ ioend->io_type != XFS_IO_UNWRITTEN &&
+ xfs_ioend_is_append(ioend) &&
+ !ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
+
+ ioend->io_bio->bi_private = ioend;
+ ioend->io_bio->bi_end_io = xfs_end_bio;
+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+ (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
@@ -481,33 +456,73 @@ xfs_submit_ioend(
* time.
*/
if (status) {
- ioend->io_error = status;
- xfs_finish_ioend(ioend);
+ ioend->io_bio->bi_error = status;
+ bio_endio(ioend->io_bio);
return status;
}
- bio = NULL;
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+ submit_bio(ioend->io_bio);
+ return 0;
+}
- if (!bio) {
-retry:
- bio = xfs_alloc_ioend_bio(bh);
- } else if (bh->b_blocknr != lastblock + 1) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
+static void
+xfs_init_bio_from_bh(
+ struct bio *bio,
+ struct buffer_head *bh)
+{
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+}
- if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
+static struct xfs_ioend *
+xfs_alloc_ioend(
+ struct inode *inode,
+ unsigned int type,
+ xfs_off_t offset,
+ struct buffer_head *bh)
+{
+ struct xfs_ioend *ioend;
+ struct bio *bio;
- lastblock = bh->b_blocknr;
- }
- if (bio)
- xfs_submit_ioend_bio(wbc, ioend, bio);
- xfs_finish_ioend(ioend);
- return 0;
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+ xfs_init_bio_from_bh(bio, bh);
+
+ ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_type = type;
+ ioend->io_inode = inode;
+ ioend->io_size = 0;
+ ioend->io_offset = offset;
+ INIT_WORK(&ioend->io_work, xfs_end_io);
+ ioend->io_append_trans = NULL;
+ ioend->io_bio = bio;
+ return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in xfs_destroy_ioend().
+ */
+static void
+xfs_chain_bio(
+ struct xfs_ioend *ioend,
+ struct writeback_control *wbc,
+ struct buffer_head *bh)
+{
+ struct bio *new;
+
+ new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+ xfs_init_bio_from_bh(new, bh);
+
+ bio_chain(ioend->io_bio, new);
+ bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+ (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ submit_bio(ioend->io_bio);
+ ioend->io_bio = new;
}
/*
@@ -523,27 +538,24 @@ xfs_add_to_ioend(
struct buffer_head *bh,
xfs_off_t offset,
struct xfs_writepage_ctx *wpc,
+ struct writeback_control *wbc,
struct list_head *iolist)
{
if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
bh->b_blocknr != wpc->last_block + 1 ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
- struct xfs_ioend *new;
-
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
-
- new = xfs_alloc_ioend(inode, wpc->io_type);
- new->io_offset = offset;
- new->io_buffer_head = bh;
- new->io_buffer_tail = bh;
- wpc->ioend = new;
- } else {
- wpc->ioend->io_buffer_tail->b_private = bh;
- wpc->ioend->io_buffer_tail = bh;
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
}
- bh->b_private = NULL;
+ /*
+ * If the buffer doesn't fit into the bio we need to allocate a new
+ * one. This shouldn't happen more than once for a given buffer.
+ */
+ while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
+ xfs_chain_bio(wpc->ioend, wbc, bh);
+
wpc->ioend->io_size += bh->b_size;
wpc->last_block = bh->b_blocknr;
xfs_start_buffer_writeback(bh);
@@ -704,7 +716,7 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
return;
}
@@ -803,7 +815,7 @@ xfs_writepage_map(
lock_buffer(bh);
if (wpc->io_type != XFS_IO_OVERWRITE)
xfs_map_at_offset(inode, bh, &wpc->imap, offset);
- xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+ xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
count++;
}
@@ -925,9 +937,9 @@ xfs_do_writepage(
* ---------------------------------^------------------|
*/
offset = i_size_read(inode);
- end_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = offset >> PAGE_SHIFT;
if (page->index < end_index)
- end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+ end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
else {
/*
* Check whether the page to write out is beyond or straddles
@@ -940,7 +952,7 @@ xfs_do_writepage(
* | | Straddles |
* ---------------------------------^-----------|--------|
*/
- unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+ unsigned offset_into_page = offset & (PAGE_SIZE - 1);
/*
* Skip the page if it is fully outside i_size, e.g. due to a
@@ -971,7 +983,7 @@ xfs_do_writepage(
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
- zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset_into_page, PAGE_SIZE);
/* Adjust the end_offset to the end of file */
end_offset = offset;
@@ -1038,6 +1050,20 @@ xfs_vm_releasepage(
trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+ /*
+ * mm accommodates an old ext3 case where clean pages might not have had
+ * the dirty bit cleared. Thus, it can send actual dirty pages to
+ * ->releasepage() via shrink_active_list(). Conversely,
+ * block_invalidatepage() can send pages that are still marked dirty
+ * but otherwise have invalidated buffers.
+ *
+ * We've historically freed buffers on the latter. Instead, quietly
+ * filter out all dirty pages to avoid spurious buffer state warnings.
+ * This can likely be removed once shrink_active_list() is fixed.
+ */
+ if (PageDirty(page))
+ return 0;
+
xfs_count_page_state(page, &delalloc, &unwritten);
if (WARN_ON_ONCE(delalloc))
@@ -1141,6 +1167,8 @@ __xfs_get_blocks(
ssize_t size;
int new = 0;
+ BUG_ON(create && !direct);
+
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1148,22 +1176,14 @@ __xfs_get_blocks(
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
- if (!create && direct && offset >= i_size_read(inode))
+ if (!create && offset >= i_size_read(inode))
return 0;
/*
* Direct I/O is usually done on preallocated files, so try getting
- * a block mapping without an exclusive lock first. For buffered
- * writes we already have the exclusive iolock anyway, so avoiding
- * a lock roundtrip here by taking the ilock exclusive from the
- * beginning is a useful micro optimization.
+ * a block mapping without an exclusive lock first.
*/
- if (create && !direct) {
- lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockmode);
- } else {
- lockmode = xfs_ilock_data_map_shared(ip);
- }
+ lockmode = xfs_ilock_data_map_shared(ip);
ASSERT(offset <= mp->m_super->s_maxbytes);
if (offset + size > mp->m_super->s_maxbytes)
@@ -1182,37 +1202,19 @@ __xfs_get_blocks(
(imap.br_startblock == HOLESTARTBLOCK ||
imap.br_startblock == DELAYSTARTBLOCK) ||
(IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
- if (direct || xfs_get_extsz_hint(ip)) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
-
- error = xfs_iomap_write_direct(ip, offset, size,
- &imap, nimaps);
- if (error)
- return error;
- new = 1;
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
- } else {
- /*
- * Delalloc reservations do not require a transaction,
- * we can go on without dropping the lock here. If we
- * are allocating a new delalloc block, make sure that
- * we set the new flag so that we mark the buffer new so
- * that we know that it is newly allocated if the write
- * fails.
- */
- if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
- new = 1;
- error = xfs_iomap_write_delay(ip, offset, size, &imap);
- if (error)
- goto out_unlock;
+ error = xfs_iomap_write_direct(ip, offset, size,
+ &imap, nimaps);
+ if (error)
+ return error;
+ new = 1;
- xfs_iunlock(ip, lockmode);
- }
trace_xfs_get_blocks_alloc(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_DELALLOC, &imap);
@@ -1233,9 +1235,7 @@ __xfs_get_blocks(
}
/* trim mapping down to size requested */
- if (direct || size > (1 << inode->i_blkbits))
- xfs_map_trim_size(inode, iblock, bh_result,
- &imap, offset, size);
+ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
/*
* For unwritten extents do not report a disk address in the buffered
@@ -1248,7 +1248,7 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
- if (create && direct) {
+ if (create) {
if (dax_fault)
ASSERT(!ISUNWRITTEN(&imap));
else
@@ -1277,14 +1277,7 @@ __xfs_get_blocks(
(new || ISUNWRITTEN(&imap))))
set_buffer_new(bh_result);
- if (imap.br_startblock == DELAYSTARTBLOCK) {
- BUG_ON(direct);
- if (create) {
- set_buffer_uptodate(bh_result);
- set_buffer_mapped(bh_result);
- set_buffer_delay(bh_result);
- }
- }
+ BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
return 0;
@@ -1334,7 +1327,7 @@ xfs_get_blocks_dax_fault(
* whereas if we have flags set we will always be called in task context
* (i.e. from a workqueue).
*/
-STATIC int
+int
xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
@@ -1391,13 +1384,10 @@ xfs_end_io_direct_write(
trace_xfs_end_io_direct_write_append(ip, offset, size);
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
- return error;
- }
- error = xfs_setfilesize(ip, tp, offset, size);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
+ &tp);
+ if (!error)
+ error = xfs_setfilesize(ip, tp, offset, size);
}
return error;
@@ -1406,237 +1396,12 @@ xfs_end_io_direct_write(
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
+ struct iov_iter *iter)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
- dio_iodone_t *endio = NULL;
- int flags = 0;
- struct block_device *bdev;
-
- if (iov_iter_rw(iter) == WRITE) {
- endio = xfs_end_io_direct_write;
- flags = DIO_ASYNC_EXTEND;
- }
-
- if (IS_DAX(inode)) {
- return dax_do_io(iocb, inode, iter, offset,
- xfs_get_blocks_direct, endio, 0);
- }
-
- bdev = xfs_find_bdev_for_inode(inode);
- return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
- struct inode *inode,
- loff_t start,
- loff_t end)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error;
-
- start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
- end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
- if (end_fsb <= start_fsb)
- return;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_alert(ip->i_mount,
- "xfs_vm_write_failed: unable to clean up ino %lld",
- ip->i_ino);
- }
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
- struct inode *inode,
- struct page *page,
- loff_t pos,
- unsigned len)
-{
- loff_t block_offset;
- loff_t block_start;
- loff_t block_end;
- loff_t from = pos & (PAGE_CACHE_SIZE - 1);
- loff_t to = from + len;
- struct buffer_head *bh, *head;
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
-
/*
- * The request pos offset might be 32 or 64 bit, this is all fine
- * on 64-bit platform. However, for 64-bit pos request on 32-bit
- * platform, the high 32-bit will be masked off if we evaluate the
- * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
- * 0xfffff000 as an unsigned long, hence the result is incorrect
- * which could cause the following ASSERT failed in most cases.
- * In order to avoid this, we can evaluate the block_offset of the
- * start of the page by using shifts rather than masks the mismatch
- * problem.
+ * We just need the method present so that open/fcntl allow direct I/O.
*/
- block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
-
- ASSERT(block_offset + from == pos);
-
- head = page_buffers(page);
- block_start = 0;
- for (bh = head; bh != head || !block_start;
- bh = bh->b_this_page, block_start = block_end,
- block_offset += bh->b_size) {
- block_end = block_start + bh->b_size;
-
- /* skip buffers before the write */
- if (block_end <= from)
- continue;
-
- /* if the buffer is after the write, we're done */
- if (block_start >= to)
- break;
-
- /*
- * Process delalloc and unwritten buffers beyond EOF. We can
- * encounter unwritten buffers in the event that a file has
- * post-EOF unwritten extents and an extending write happens to
- * fail (e.g., an unaligned write that also involves a delalloc
- * to the same page).
- */
- if (!buffer_delay(bh) && !buffer_unwritten(bh))
- continue;
-
- if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
- block_offset < i_size_read(inode))
- continue;
-
- if (buffer_delay(bh))
- xfs_vm_kill_delalloc_range(inode, block_offset,
- block_offset + bh->b_size);
-
- /*
- * This buffer does not contain data anymore. make sure anyone
- * who finds it knows that for certain.
- */
- clear_buffer_delay(bh);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
- clear_buffer_new(bh);
- clear_buffer_dirty(bh);
- clear_buffer_unwritten(bh);
- }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata)
-{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- struct page *page;
- int status;
- struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
-
- ASSERT(len <= PAGE_CACHE_SIZE);
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
-
- status = __block_write_begin(page, pos, len, xfs_get_blocks);
- if (xfs_mp_fail_writes(mp))
- status = -EIO;
- if (unlikely(status)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
-
- xfs_vm_write_failed(inode, page, pos, len);
- unlock_page(page);
-
- /*
- * If the write is beyond EOF, we only want to kill blocks
- * allocated in this write, not blocks that were previously
- * written successfully.
- */
- if (xfs_mp_fail_writes(mp))
- isize = 0;
- if (pos + len > isize) {
- ssize_t start = max_t(ssize_t, pos, isize);
-
- truncate_pagecache_range(inode, start, pos + len);
- }
-
- page_cache_release(page);
- page = NULL;
- }
-
- *pagep = page;
- return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned copied,
- struct page *page,
- void *fsdata)
-{
- int ret;
-
- ASSERT(len <= PAGE_CACHE_SIZE);
-
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (unlikely(ret < len)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
- loff_t to = pos + len;
-
- if (to > isize) {
- /* only kill blocks in this write beyond EOF */
- if (pos > isize)
- isize = pos;
- xfs_vm_kill_delalloc_range(inode, isize, to);
- truncate_pagecache_range(inode, isize, to);
- }
- }
- return ret;
+ return -EINVAL;
}
STATIC sector_t
@@ -1749,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
.set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
- .write_begin = xfs_vm_write_begin,
- .write_end = xfs_vm_write_end,
.bmap = xfs_vm_bmap,
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b4421177b68dc..bf2d9a141a734 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,7 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern mempool_t *xfs_ioend_pool;
+extern struct bio_set *xfs_ioend_bioset;
/*
* Types of I/O for bmap clustering and I/O completion tracking.
@@ -37,22 +37,19 @@ enum {
{ XFS_IO_OVERWRITE, "overwrite" }
/*
- * xfs_ioend struct manages large extent writes for XFS.
- * It can manage several multi-page bio's at once.
+ * Structure for buffered I/O completions.
*/
-typedef struct xfs_ioend {
+struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
unsigned int io_type; /* delalloc / unwritten */
- int io_error; /* I/O error code */
- atomic_t io_remaining; /* hold count */
struct inode *io_inode; /* file being written to */
- struct buffer_head *io_buffer_head;/* buffer linked list head */
- struct buffer_head *io_buffer_tail;/* buffer linked list tail */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
-} xfs_ioend_t;
+ struct bio *io_bio; /* bio being built */
+ struct bio io_inline_bio; /* MUST BE LAST! */
+};
extern const struct address_space_operations xfs_address_space_operations;
@@ -63,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
+int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private);
+
extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index dd4824589470e..e3da5d448bcff 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern {
*========================================================================*/
+/* Return 0 on success, or -errno; other state communicated via *context */
typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
- unsigned char *, int, int, unsigned char *);
+ unsigned char *, int, int);
typedef struct xfs_attr_list_context {
struct xfs_inode *dp; /* inode */
@@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context {
int firstu; /* first used byte in buffer */
int flags; /* from VOP call */
int resynch; /* T/F: resynch with cursor */
- int put_value; /* T/F: need value for listent */
put_listent_func_t put_listent; /* list output fmt function */
int index; /* index into output buffer */
} xfs_attr_list_context_t;
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2bb959ada45bb..be0b79d8900f0 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
* Recurse (gasp!) through the attribute nodes until we find leaves.
* We're doing a depth-first traversal in order to invalidate everything.
*/
-int
+static int
xfs_attr3_root_inactive(
struct xfs_trans **trans,
struct xfs_inode *dp)
@@ -405,21 +405,11 @@ xfs_attr_inactive(
goto out_destroy_fork;
xfs_iunlock(dp, lock_mode);
- /*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
lock_mode = 0;
- trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
- error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans);
if (error)
- goto out_cancel;
+ goto out_destroy_fork;
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(dp, lock_mode);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 4fa14820e2e22..25e76cd6c0533 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
* we have to calculate each entries' hashvalue and sort them before
* we can begin returning them to the user.
*/
-int
+static int
xfs_attr_shortform_list(xfs_attr_list_context_t *context)
{
attrlist_cursor_kern_t *cursor;
@@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sfe->flags,
sfe->nameval,
(int)sfe->namelen,
- (int)sfe->valuelen,
- &sfe->nameval[sfe->namelen]);
-
+ (int)sfe->valuelen);
+ if (error)
+ return error;
/*
* Either search callback finished early or
* didn't fit it all in the buffer after all.
*/
if (context->seen_enough)
break;
-
- if (error)
- return error;
sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
}
trace_xfs_attr_list_sf_all(context);
@@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sbp->flags,
sbp->name,
sbp->namelen,
- sbp->valuelen,
- &sbp->name[sbp->namelen]);
+ sbp->valuelen);
if (error) {
kmem_free(sbuf);
return error;
@@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int(
*/
retval = 0;
for (; i < ichdr.count; entry++, i++) {
+ char *name;
+ int namelen, valuelen;
+
if (be32_to_cpu(entry->hashval) != cursor->hashval) {
cursor->hashval = be32_to_cpu(entry->hashval);
cursor->offset = 0;
@@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int(
continue; /* skip incomplete entries */
if (entry->flags & XFS_ATTR_LOCAL) {
- xfs_attr_leaf_name_local_t *name_loc =
- xfs_attr3_leaf_name_local(leaf, i);
-
- retval = context->put_listent(context,
- entry->flags,
- name_loc->nameval,
- (int)name_loc->namelen,
- be16_to_cpu(name_loc->valuelen),
- &name_loc->nameval[name_loc->namelen]);
- if (retval)
- return retval;
+ xfs_attr_leaf_name_local_t *name_loc;
+
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ name = name_loc->nameval;
+ namelen = name_loc->namelen;
+ valuelen = be16_to_cpu(name_loc->valuelen);
} else {
- xfs_attr_leaf_name_remote_t *name_rmt =
- xfs_attr3_leaf_name_remote(leaf, i);
-
- int valuelen = be32_to_cpu(name_rmt->valuelen);
-
- if (context->put_value) {
- xfs_da_args_t args;
-
- memset((char *)&args, 0, sizeof(args));
- args.geo = context->dp->i_mount->m_attr_geo;
- args.dp = context->dp;
- args.whichfork = XFS_ATTR_FORK;
- args.valuelen = valuelen;
- args.rmtvaluelen = valuelen;
- args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
- args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args.rmtblkcnt = xfs_attr3_rmt_blocks(
- args.dp->i_mount, valuelen);
- retval = xfs_attr_rmtval_get(&args);
- if (!retval)
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- args.value);
- kmem_free(args.value);
- } else {
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- NULL);
- }
- if (retval)
- return retval;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ name = name_rmt->name;
+ namelen = name_rmt->namelen;
+ valuelen = be32_to_cpu(name_rmt->valuelen);
}
+
+ retval = context->put_listent(context, entry->flags,
+ name, namelen, valuelen);
+ if (retval)
+ break;
if (context->seen_enough)
break;
cursor->offset++;
@@ -551,8 +519,7 @@ xfs_attr_put_listent(
int flags,
unsigned char *name,
int namelen,
- int valuelen,
- unsigned char *value)
+ int valuelen)
{
struct attrlist *alist = (struct attrlist *)context->alist;
attrlist_ent_t *aep;
@@ -581,7 +548,7 @@ xfs_attr_put_listent(
trace_xfs_attr_list_full(context);
alist->al_more = 1;
context->seen_enough = 1;
- return 1;
+ return 0;
}
aep = (attrlist_ent_t *)&context->alist[context->firstu];
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a32c1dcae2ff3..4ece4f2ffc727 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -25,6 +25,7 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
@@ -40,6 +41,7 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_rmap_btree.h"
/* Kernel only BMAP related definitions and functions */
@@ -72,91 +74,11 @@ xfs_zero_extent(
struct xfs_mount *mp = ip->i_mount;
xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
sector_t block = XFS_BB_TO_FSBT(mp, sector);
- ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
-
- if (IS_DAX(VFS_I(ip)))
- return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
- sector, size);
-
- /*
- * let the block layer decide on the fastest method of
- * implementing the zeroing.
- */
- return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
-}
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller. Frees all the extents that need freeing, which must be done
- * last due to locking considerations. We never free any extents in
- * the first transaction.
- *
- * If an inode *ip is provided, rejoin it to the transaction if
- * the transaction was committed.
- */
-int /* error */
-xfs_bmap_finish(
- struct xfs_trans **tp, /* transaction pointer addr */
- struct xfs_bmap_free *flist, /* i/o: list extents to free */
- struct xfs_inode *ip)
-{
- struct xfs_efd_log_item *efd; /* extent free data */
- struct xfs_efi_log_item *efi; /* extent free intention */
- int error; /* error return value */
- int committed;/* xact committed or not */
- struct xfs_bmap_free_item *free; /* free extent item */
- struct xfs_bmap_free_item *next; /* next item on free list */
-
- ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (flist->xbf_count == 0)
- return 0;
-
- efi = xfs_trans_get_efi(*tp, flist->xbf_count);
- for (free = flist->xbf_first; free; free = free->xbfi_next)
- xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
- free->xbfi_blockcount);
-
- error = __xfs_trans_roll(tp, ip, &committed);
- if (error) {
- /*
- * If the transaction was committed, drop the EFD reference
- * since we're bailing out of here. The other reference is
- * dropped when the EFI hits the AIL.
- *
- * If the transaction was not committed, the EFI is freed by the
- * EFI item unlock handler on abort. Also, we have a new
- * transaction so we should return committed=1 even though we're
- * returning an error.
- */
- if (committed) {
- xfs_efi_release(efi);
- xfs_force_shutdown((*tp)->t_mountp,
- (error == -EFSCORRUPTED) ?
- SHUTDOWN_CORRUPT_INCORE :
- SHUTDOWN_META_IO_ERROR);
- }
- return error;
- }
-
- /*
- * Get an EFD and free each extent in the list, logging to the EFD in
- * the process. The remaining bmap free list is cleaned up by the caller
- * on error.
- */
- efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
- for (free = flist->xbf_first; free != NULL; free = next) {
- next = free->xbfi_next;
-
- error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
- free->xbfi_blockcount);
- if (error)
- return error;
-
- xfs_bmap_del_free(flist, NULL, free);
- }
-
- return 0;
+ return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+ block << (mp->m_super->s_blocksize_bits - 9),
+ count_fsb << (mp->m_super->s_blocksize_bits - 9),
+ GFP_NOFS, true);
}
int
@@ -205,9 +127,9 @@ xfs_bmap_rtalloc(
/*
* Lock out modifications to both the RT bitmap and summary inodes
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
@@ -416,7 +338,7 @@ xfs_bmap_count_tree(
/*
* Count fsblocks of the given fork.
*/
-int /* error */
+static int /* error */
xfs_bmap_count_blocks(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode */
@@ -764,7 +686,7 @@ xfs_bmap_punch_delalloc_range(
xfs_bmbt_irec_t imap;
int nimaps = 1;
xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
+ struct xfs_defer_ops dfops;
/*
* Map the range first and check that it is a delalloc extent
@@ -795,18 +717,18 @@ xfs_bmap_punch_delalloc_range(
WARN_ON(imap.br_blockcount == 0);
/*
- * Note: while we initialise the firstblock/flist pair, they
+ * Note: while we initialise the firstblock/dfops pair, they
* should never be used because blocks should never be
* allocated or freed for a delalloc extent and hence we need
* don't cancel or finish them after the xfs_bunmapi() call.
*/
- xfs_bmap_init(&flist, &firstblock);
+ xfs_defer_init(&dfops, &firstblock);
error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
- &flist, &done);
+ &dfops, &done);
if (error)
break;
- ASSERT(!flist.xbf_count && !flist.xbf_first);
+ ASSERT(!xfs_defer_has_unfinished_work(&dfops));
next_block:
start_fsb++;
remaining--;
@@ -900,19 +822,15 @@ xfs_free_eofblocks(
* Free them up now by truncating the file to
* its current size.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
if (need_iolock) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- xfs_trans_cancel(tp);
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
return -EAGAIN;
- }
}
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+ &tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
if (need_iolock)
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
@@ -967,7 +885,7 @@ xfs_alloc_file_space(
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
uint qblocks, resblks, resrtextents;
int error;
@@ -1037,9 +955,9 @@ xfs_alloc_file_space(
/*
* Allocate and setup the transaction.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- resblks, resrtextents);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+ resrtextents, 0, &tp);
+
/*
* Check for running out of space
*/
@@ -1048,7 +966,6 @@ xfs_alloc_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1059,17 +976,17 @@ xfs_alloc_file_space(
xfs_trans_ijoin(tp, ip, 0);
- xfs_bmap_init(&free_list, &firstfsb);
+ xfs_defer_init(&dfops, &firstfsb);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, &firstfsb,
- resblks, imapp, &nimaps, &free_list);
+ resblks, imapp, &nimaps, &dfops);
if (error)
goto error0;
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto error0;
@@ -1092,7 +1009,7 @@ xfs_alloc_file_space(
return error;
error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
error1: /* Just cancel transaction */
@@ -1101,99 +1018,120 @@ error1: /* Just cancel transaction */
return error;
}
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
- xfs_inode_t *ip,
- xfs_off_t startoff,
- xfs_off_t endoff)
+static int
+xfs_unmap_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t startoffset_fsb,
+ xfs_filblks_t len_fsb,
+ int *done)
{
- xfs_bmbt_irec_t imap;
- xfs_fileoff_t offset_fsb;
- xfs_off_t lastoffset;
- xfs_off_t offset;
- xfs_buf_t *bp;
- xfs_mount_t *mp = ip->i_mount;
- int nimap;
- int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t firstfsb;
+ uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ int error;
- /*
- * Avoid doing I/O beyond eof - it's not necessary
- * since nothing can read beyond eof. The space will
- * be zeroed when the file is extended anyway.
- */
- if (startoff >= XFS_ISIZE(ip))
- return 0;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ if (error) {
+ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ return error;
+ }
- if (endoff > XFS_ISIZE(ip))
- endoff = XFS_ISIZE(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
+ ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_trans_cancel;
- for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
- uint lock_mode;
+ xfs_trans_ijoin(tp, ip, 0);
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- nimap = 1;
+ xfs_defer_init(&dfops, &firstfsb);
+ error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
+ &dfops, done);
+ if (error)
+ goto out_bmap_cancel;
- lock_mode = xfs_ilock_data_map_shared(ip);
- error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
- xfs_iunlock(ip, lock_mode);
+ error = xfs_defer_finish(&tp, &dfops, ip);
+ if (error)
+ goto out_bmap_cancel;
- if (error || nimap < 1)
- break;
- ASSERT(imap.br_blockcount >= 1);
- ASSERT(imap.br_startoff == offset_fsb);
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = xfs_trans_commit(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
- if (imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_state == XFS_EXT_UNWRITTEN) {
- /* skip the entire extent */
- lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
- imap.br_blockcount) - 1;
- continue;
- }
+out_bmap_cancel:
+ xfs_defer_cancel(&dfops);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+static int
+xfs_adjust_extent_unmap_boundaries(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *startoffset_fsb,
+ xfs_fileoff_t *endoffset_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ int nimap, error;
+ xfs_extlen_t mod = 0;
- lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
- if (lastoffset > endoff)
- lastoffset = endoff;
+ nimap = 1;
+ error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
+ if (error)
+ return error;
- /* DAX can just zero the backing device directly */
- if (IS_DAX(VFS_I(ip))) {
- error = dax_zero_page_range(VFS_I(ip), offset,
- lastoffset - offset + 1,
- xfs_get_blocks_direct);
- if (error)
- return error;
- continue;
- }
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ xfs_daddr_t block;
- error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
- xfs_fsb_to_db(ip, imap.br_startblock),
- BTOBB(mp->m_sb.sb_blocksize),
- 0, &bp, NULL);
- if (error)
- return error;
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ block = imap.br_startblock;
+ mod = do_div(block, mp->m_sb.sb_rextsize);
+ if (mod)
+ *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+ }
- memset(bp->b_addr +
- (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
- 0, lastoffset - offset + 1);
+ nimap = 1;
+ error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
+ if (error)
+ return error;
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- return error;
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ mod++;
+ if (mod && mod != mp->m_sb.sb_rextsize)
+ *endoffset_fsb -= mod;
}
- return error;
+
+ return 0;
+}
+
+static int
+xfs_flush_unmap_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ xfs_off_t rounding, start, end;
+ int error;
+
+ /* wait for the completion of any pending DIOs */
+ inode_dio_wait(inode);
+
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+ start = round_down(offset, rounding);
+ end = round_up(offset + len, rounding) - 1;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return error;
+ truncate_pagecache_range(inode, start, end);
+ return 0;
}
int
@@ -1202,24 +1140,10 @@ xfs_free_file_space(
xfs_off_t offset,
xfs_off_t len)
{
- int done;
- xfs_fileoff_t endoffset_fsb;
- int error;
- xfs_fsblock_t firstfsb;
- xfs_bmap_free_t free_list;
- xfs_bmbt_irec_t imap;
- xfs_off_t ioffset;
- xfs_off_t iendoffset;
- xfs_extlen_t mod=0;
- xfs_mount_t *mp;
- int nimap;
- uint resblks;
- xfs_off_t rounding;
- int rt;
+ struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
- xfs_trans_t *tp;
-
- mp = ip->i_mount;
+ xfs_fileoff_t endoffset_fsb;
+ int done = 0, error;
trace_xfs_free_file_space(ip);
@@ -1227,143 +1151,45 @@ xfs_free_file_space(
if (error)
return error;
- error = 0;
if (len <= 0) /* if nothing being freed */
- return error;
- rt = XFS_IS_REALTIME_INODE(ip);
- startoffset_fsb = XFS_B_TO_FSB(mp, offset);
- endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
- /* wait for the completion of any pending DIOs */
- inode_dio_wait(VFS_I(ip));
+ return 0;
- rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
- ioffset = round_down(offset, rounding);
- iendoffset = round_up(offset + len, rounding) - 1;
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
- iendoffset);
+ error = xfs_flush_unmap_range(ip, offset, len);
if (error)
- goto out;
- truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
+ return error;
+
+ startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/*
- * Need to zero the stuff we're not freeing, on disk.
- * If it's a realtime file & can't use unwritten extents then we
- * actually need to zero the extent edges. Otherwise xfs_bunmapi
- * will take care of it for us.
+ * Need to zero the stuff we're not freeing, on disk. If it's a RT file
+ * and we can't use unwritten extents then we actually need to ensure
+ * to zero the whole extent, otherwise we just need to take care of block
+ * boundaries, and xfs_bunmapi will handle the rest.
*/
- if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- nimap = 1;
- error = xfs_bmapi_read(ip, startoffset_fsb, 1,
- &imap, &nimap, 0);
+ if (XFS_IS_REALTIME_INODE(ip) &&
+ !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
+ &endoffset_fsb);
if (error)
- goto out;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- xfs_daddr_t block;
-
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- block = imap.br_startblock;
- mod = do_div(block, mp->m_sb.sb_rextsize);
- if (mod)
- startoffset_fsb += mp->m_sb.sb_rextsize - mod;
- }
- nimap = 1;
- error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
- &imap, &nimap, 0);
- if (error)
- goto out;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- mod++;
- if (mod && (mod != mp->m_sb.sb_rextsize))
- endoffset_fsb -= mod;
- }
- }
- if ((done = (endoffset_fsb <= startoffset_fsb)))
- /*
- * One contiguous piece to clear
- */
- error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
- else {
- /*
- * Some full blocks, possibly two pieces to clear
- */
- if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
- error = xfs_zero_remaining_bytes(ip, offset,
- XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
- if (!error &&
- XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
- error = xfs_zero_remaining_bytes(ip,
- XFS_FSB_TO_B(mp, endoffset_fsb),
- offset + len - 1);
+ return error;
}
- /*
- * free file space until done or until there is an error
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- while (!error && !done) {
-
- /*
- * allocate and setup the transaction. Allow this
- * transaction to dip into the reserve blocks to ensure
- * the freeing of the space succeeds at ENOSPC.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
-
- /*
- * check for running out of space
- */
- if (error) {
- /*
- * Free the transaction structure.
- */
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
- break;
+ if (endoffset_fsb > startoffset_fsb) {
+ while (!done) {
+ error = xfs_unmap_extent(ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb, &done);
+ if (error)
+ return error;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp,
- ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
- resblks, 0, XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto error1;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * issue the bunmapi() call to free the blocks
- */
- xfs_bmap_init(&free_list, &firstfsb);
- error = xfs_bunmapi(tp, ip, startoffset_fsb,
- endoffset_fsb - startoffset_fsb,
- 0, 2, &firstfsb, &free_list, &done);
- if (error)
- goto error0;
-
- /*
- * complete the transaction
- */
- error = xfs_bmap_finish(&tp, &free_list, NULL);
- if (error)
- goto error0;
-
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- out:
- return error;
-
- error0:
- xfs_bmap_cancel(&free_list);
- error1:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out;
+ /*
+ * Now that we've unmapped all full blocks we'll have to zero out any
+ * partial block at the beginning and/or end. xfs_zero_range is
+ * smart enough to skip any holes, including those we just created.
+ */
+ return xfs_zero_range(ip, offset, len, NULL);
}
/*
@@ -1425,7 +1251,7 @@ xfs_shift_file_space(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error;
- struct xfs_bmap_free free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
xfs_fileoff_t stop_fsb;
xfs_fileoff_t next_fsb;
@@ -1466,7 +1292,7 @@ xfs_shift_file_space(
if (error)
return error;
error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- offset >> PAGE_CACHE_SHIFT, -1);
+ offset >> PAGE_SHIFT, -1);
if (error)
return error;
@@ -1482,19 +1308,16 @@ xfs_shift_file_space(
}
while (!error && !done) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
/*
* We would need to reserve permanent block for transaction.
* This will come into picture when after shifting extent into
* hole we found that adjacent extents can be merged which
* may lead to freeing of a block during record update.
*/
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
break;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
@@ -1506,19 +1329,19 @@ xfs_shift_file_space(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
/*
* We are using the write transaction in which max 2 bmbt
* updates are allowed
*/
error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
- &done, stop_fsb, &first_block, &free_list,
+ &done, stop_fsb, &first_block, &dfops,
direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
if (error)
goto out_bmap_cancel;
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_bmap_cancel;
@@ -1528,7 +1351,7 @@ xfs_shift_file_space(
return error;
out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
@@ -1712,6 +1535,10 @@ xfs_swap_extents(
__uint64_t tmp;
int lock_flags;
+ /* XXX: we can't do this with rmap, will fix later */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
if (!tempifp) {
error = -ENOMEM;
@@ -1747,12 +1574,9 @@ xfs_swap_extents(
if (error)
goto out_unlock;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
goto out_unlock;
- }
/*
* Lock and join the inodes to the tansaction so that transaction commit
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index af97d9a1dfb44..68a621a8e0c07 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -21,7 +21,7 @@
/* Kernel only BMAP related definitions and functions */
struct xfs_bmbt_irec;
-struct xfs_bmap_free_item;
+struct xfs_extent_free_item;
struct xfs_ifork;
struct xfs_inode;
struct xfs_mount;
@@ -31,8 +31,6 @@ struct xfs_bmalloca;
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
int whichfork, int *eof);
-int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
- int whichfork, int *count);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
@@ -42,9 +40,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
xfs_bmap_format_t formatter, void *arg);
/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
-void xfs_bmap_del_free(struct xfs_bmap_free *flist,
- struct xfs_bmap_free_item *prev,
- struct xfs_bmap_free_item *free);
int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
int rt, int eof, int delay, int convert,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9a2191b911377..47a318ce82e0a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,6 +80,47 @@ xfs_buf_vmap_len(
}
/*
+ * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+ * this buffer. The count is incremented once per buffer (per hold cycle)
+ * because the corresponding decrement is deferred to buffer release. Buffers
+ * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+ * tracking adds unnecessary overhead. This is used for synchronization purposes
+ * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+ * in-flight buffers.
+ *
+ * Buffers that are never released (e.g., superblock, iclog buffers) must set
+ * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+ * never reaches zero and unmount hangs indefinitely.
+ */
+static inline void
+xfs_buf_ioacct_inc(
+ struct xfs_buf *bp)
+{
+ if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+ return;
+
+ ASSERT(bp->b_flags & XBF_ASYNC);
+ bp->b_flags |= _XBF_IN_FLIGHT;
+ percpu_counter_inc(&bp->b_target->bt_io_count);
+}
+
+/*
+ * Clear the in-flight state on a buffer about to be released to the LRU or
+ * freed and unaccount from the buftarg.
+ */
+static inline void
+xfs_buf_ioacct_dec(
+ struct xfs_buf *bp)
+{
+ if (!(bp->b_flags & _XBF_IN_FLIGHT))
+ return;
+
+ ASSERT(bp->b_flags & XBF_ASYNC);
+ bp->b_flags &= ~_XBF_IN_FLIGHT;
+ percpu_counter_dec(&bp->b_target->bt_io_count);
+}
+
+/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
* reference count falls to zero. If the buffer is already on the LRU, we need
@@ -102,6 +143,14 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
+ /*
+ * Once the buffer is marked stale and unlocked, a subsequent lookup
+ * could reset b_flags. There is no guarantee that the buffer is
+ * unaccounted (released to LRU) before that occurs. Drop in-flight
+ * status now to preserve accounting consistency.
+ */
+ xfs_buf_ioacct_dec(bp);
+
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
- bp = _xfs_buf_alloc(target, &map, 1, 0);
+ /* flags might contain irrelevant bits, pass only what we care about */
+ bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
if (unlikely(bp == NULL))
goto fail;
@@ -866,63 +916,85 @@ xfs_buf_hold(
}
/*
- * Releases a hold on the specified buffer. If the
- * the hold count is 1, calls xfs_buf_free.
+ * Release a hold on the specified buffer. If the hold count is 1, the buffer is
+ * placed on LRU or freed (depending on b_lru_ref).
*/
void
xfs_buf_rele(
xfs_buf_t *bp)
{
struct xfs_perag *pag = bp->b_pag;
+ bool release;
+ bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
if (!pag) {
ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
- if (atomic_dec_and_test(&bp->b_hold))
+ if (atomic_dec_and_test(&bp->b_hold)) {
+ xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp);
+ }
return;
}
ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0);
- if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
- spin_lock(&bp->b_lock);
- if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU take a new
- * reference to the buffer for the LRU and clear the
- * (now stale) dispose list state flag
- */
- if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- atomic_inc(&bp->b_hold);
- }
- spin_unlock(&bp->b_lock);
- spin_unlock(&pag->pag_buf_lock);
- } else {
- /*
- * most of the time buffers will already be removed from
- * the LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the
- * buffer was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
- spin_unlock(&bp->b_lock);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
- xfs_buf_free(bp);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ spin_lock(&bp->b_lock);
+ if (!release) {
+ /*
+ * Drop the in-flight state if the buffer is already on the LRU
+ * and it holds the only reference. This is racy because we
+ * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+ * ensures the decrement occurs only once per-buf.
+ */
+ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+ xfs_buf_ioacct_dec(bp);
+ goto out_unlock;
+ }
+
+ /* the last reference has been dropped ... */
+ xfs_buf_ioacct_dec(bp);
+ if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /*
+ * If the buffer is added to the LRU take a new reference to the
+ * buffer for the LRU and clear the (now stale) dispose list
+ * state flag
+ */
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ bp->b_state &= ~XFS_BSTATE_DISPOSE;
+ atomic_inc(&bp->b_hold);
}
+ spin_unlock(&pag->pag_buf_lock);
+ } else {
+ /*
+ * most of the time buffers will already be removed from the
+ * LRU, so optimise that case by checking for the
+ * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+ * was on was the disposal list
+ */
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ } else {
+ ASSERT(list_empty(&bp->b_lru));
+ }
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+ freebuf = true;
}
+
+out_unlock:
+ spin_unlock(&bp->b_lock);
+
+ if (freebuf)
+ xfs_buf_free(bp);
}
@@ -944,10 +1016,12 @@ xfs_buf_trylock(
int locked;
locked = down_trylock(&bp->b_sema) == 0;
- if (locked)
+ if (locked) {
XB_SET_OWNER(bp);
-
- trace_xfs_buf_trylock(bp, _RET_IP_);
+ trace_xfs_buf_trylock(bp, _RET_IP_);
+ } else {
+ trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+ }
return locked;
}
@@ -1100,22 +1174,18 @@ xfs_bwrite(
return error;
}
-STATIC void
+static void
xfs_buf_bio_end_io(
struct bio *bio)
{
- xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
+ struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
/*
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (bio->bi_error) {
- spin_lock(&bp->b_lock);
- if (!bp->b_io_error)
- bp->b_io_error = bio->bi_error;
- spin_unlock(&bp->b_lock);
- }
+ if (bio->bi_error)
+ cmpxchg(&bp->b_io_error, 0, bio->bi_error);
if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
@@ -1131,7 +1201,8 @@ xfs_buf_ioapply_map(
int map,
int *buf_offset,
int *count,
- int rw)
+ int op,
+ int op_flags)
{
int page_index;
int total_nr_pages = bp->b_page_count;
@@ -1161,16 +1232,14 @@ xfs_buf_ioapply_map(
next_chunk:
atomic_inc(&bp->b_io_remaining);
- nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
- if (nr_pages > total_nr_pages)
- nr_pages = total_nr_pages;
+ nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOIO, nr_pages);
bio->bi_bdev = bp->b_target->bt_bdev;
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
-
+ bio_set_op_attrs(bio, op, op_flags);
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
@@ -1194,7 +1263,7 @@ next_chunk:
flush_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
}
- submit_bio(rw, bio);
+ submit_bio(bio);
if (size)
goto next_chunk;
} else {
@@ -1214,7 +1283,8 @@ _xfs_buf_ioapply(
struct xfs_buf *bp)
{
struct blk_plug plug;
- int rw;
+ int op;
+ int op_flags = 0;
int offset;
int size;
int i;
@@ -1233,14 +1303,13 @@ _xfs_buf_ioapply(
bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
if (bp->b_flags & XBF_WRITE) {
+ op = REQ_OP_WRITE;
if (bp->b_flags & XBF_SYNCIO)
- rw = WRITE_SYNC;
- else
- rw = WRITE;
+ op_flags = WRITE_SYNC;
if (bp->b_flags & XBF_FUA)
- rw |= REQ_FUA;
+ op_flags |= REQ_FUA;
if (bp->b_flags & XBF_FLUSH)
- rw |= REQ_FLUSH;
+ op_flags |= REQ_PREFLUSH;
/*
* Run the write verifier callback function if it exists. If
@@ -1270,13 +1339,14 @@ _xfs_buf_ioapply(
}
}
} else if (bp->b_flags & XBF_READ_AHEAD) {
- rw = READA;
+ op = REQ_OP_READ;
+ op_flags = REQ_RAHEAD;
} else {
- rw = READ;
+ op = REQ_OP_READ;
}
/* we only use the buffer cache for meta-data */
- rw |= REQ_META;
+ op_flags |= REQ_META;
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
@@ -1288,7 +1358,7 @@ _xfs_buf_ioapply(
size = BBTOB(bp->b_io_length);
blk_start_plug(&plug);
for (i = 0; i < bp->b_map_count; i++) {
- xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
+ xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
if (bp->b_error)
break;
if (size <= 0)
@@ -1343,6 +1413,7 @@ xfs_buf_submit(
* xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
+ xfs_buf_ioacct_inc(bp);
_xfs_buf_ioapply(bp);
/*
@@ -1528,13 +1599,19 @@ xfs_wait_buftarg(
int loop = 0;
/*
- * We need to flush the buffer workqueue to ensure that all IO
- * completion processing is 100% done. Just waiting on buffer locks is
- * not sufficient for async IO as the reference count held over IO is
- * not released until after the buffer lock is dropped. Hence we need to
- * ensure here that all reference counts have been dropped before we
- * start walking the LRU list.
+ * First wait on the buftarg I/O count for all in-flight buffers to be
+ * released. This is critical as new buffers do not make the LRU until
+ * they are released.
+ *
+ * Next, flush the buffer workqueue to ensure all completion processing
+ * has finished. Just waiting on buffer locks is not sufficient for
+ * async IO as the reference count held over IO is not released until
+ * after the buffer lock is dropped. Hence we need to ensure here that
+ * all reference counts have been dropped before we start walking the
+ * LRU list.
*/
+ while (percpu_counter_sum(&btp->bt_io_count))
+ delay(100);
drain_workqueue(btp->bt_mount->m_buf_workqueue);
/* loop until there is nothing left on the lru list. */
@@ -1631,6 +1708,8 @@ xfs_free_buftarg(
struct xfs_buftarg *btp)
{
unregister_shrinker(&btp->bt_shrinker);
+ ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+ percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1695,6 +1774,9 @@ xfs_alloc_buftarg(
if (list_lru_init(&btp->bt_lru))
goto error;
+ if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ goto error;
+
btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1778,18 +1860,33 @@ xfs_buf_cmp(
return 0;
}
+/*
+ * submit buffers for write.
+ *
+ * When we have a large buffer list, we do not want to hold all the buffers
+ * locked while we block on the request queue waiting for IO dispatch. To avoid
+ * this problem, we lock and submit buffers in groups of 50, thereby minimising
+ * the lock hold times for lists which may contain thousands of objects.
+ *
+ * To do this, we sort the buffer list before we walk the list to lock and
+ * submit buffers, and we plug and unplug around each group of buffers we
+ * submit.
+ */
static int
-__xfs_buf_delwri_submit(
+xfs_buf_delwri_submit_buffers(
struct list_head *buffer_list,
- struct list_head *io_list,
- bool wait)
+ struct list_head *wait_list)
{
- struct blk_plug plug;
struct xfs_buf *bp, *n;
+ LIST_HEAD (submit_list);
int pinned = 0;
+ struct blk_plug plug;
+ list_sort(NULL, buffer_list, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
- if (!wait) {
+ if (!wait_list) {
if (xfs_buf_ispinned(bp)) {
pinned++;
continue;
@@ -1812,25 +1909,21 @@ __xfs_buf_delwri_submit(
continue;
}
- list_move_tail(&bp->b_list, io_list);
trace_xfs_buf_delwri_split(bp, _RET_IP_);
- }
-
- list_sort(NULL, io_list, xfs_buf_cmp);
-
- blk_start_plug(&plug);
- list_for_each_entry_safe(bp, n, io_list, b_list) {
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
- bp->b_flags |= XBF_WRITE | XBF_ASYNC;
/*
- * we do all Io submission async. This means if we need to wait
- * for IO completion we need to take an extra reference so the
- * buffer is still valid on the other side.
+ * We do all IO submission async. This means if we need
+ * to wait for IO completion we need to take an extra
+ * reference so the buffer is still valid on the other
+ * side. We need to move the buffer onto the wait_list
+ * at this point so the caller can still access it.
*/
- if (wait)
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+ if (wait_list) {
xfs_buf_hold(bp);
- else
+ list_move_tail(&bp->b_list, wait_list);
+ } else
list_del_init(&bp->b_list);
xfs_buf_submit(bp);
@@ -1853,8 +1946,7 @@ int
xfs_buf_delwri_submit_nowait(
struct list_head *buffer_list)
{
- LIST_HEAD (io_list);
- return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+ return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
}
/*
@@ -1869,15 +1961,15 @@ int
xfs_buf_delwri_submit(
struct list_head *buffer_list)
{
- LIST_HEAD (io_list);
+ LIST_HEAD (wait_list);
int error = 0, error2;
struct xfs_buf *bp;
- __xfs_buf_delwri_submit(buffer_list, &io_list, true);
+ xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
/* Wait for IO to complete. */
- while (!list_empty(&io_list)) {
- bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+ while (!list_empty(&wait_list)) {
+ bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4eb89bd4ee73b..1c2e52b2d9261 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -43,6 +43,7 @@ typedef enum {
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
@@ -62,6 +63,7 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
+#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
typedef unsigned int xfs_buf_flags_t;
@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }
+ { _XBF_COMPOUND, "COMPOUND" }, \
+ { _XBF_IN_FLIGHT, "IN_FLIGHT" }
/*
@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
/* LRU control structures */
struct shrinker bt_shrinker;
struct list_lru bt_lru;
+
+ struct percpu_counter bt_io_count;
} xfs_buftarg_t;
struct xfs_buf;
@@ -183,6 +188,26 @@ typedef struct xfs_buf {
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset in first page */
int b_error; /* error code on I/O */
+
+ /*
+ * async write failure retry count. Initialised to zero on the first
+ * failure, then when it exceeds the maximum configured without a
+ * success the write is considered to be failed permanently and the
+ * iodone handler will take appropriate action.
+ *
+ * For retry timeouts, we record the jiffie of the first failure. This
+ * means that we can change the retry timeout for buffers already under
+ * I/O and thus avoid getting stuck in a retry loop with a long timeout.
+ *
+ * last_error is used to ensure that we are getting repeated errors, not
+ * different errors. e.g. a block device might change ENOSPC to EIO when
+ * a failure timeout occurs, so we want to re-initialise the error
+ * retry behaviour appropriately when that happens.
+ */
+ int b_retries;
+ unsigned long b_first_retry_time; /* in jiffies */
+ int b_last_error;
+
const struct xfs_buf_ops *b_ops;
#ifdef XFS_BUF_LOCK_TRACKING
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 99e91a0e554ea..e455f9098d496 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -359,7 +359,7 @@ xfs_buf_item_format(
for (i = 0; i < bip->bli_format_count; i++) {
xfs_buf_item_format_segment(bip, lv, &vecp, offset,
&bip->bli_formats[i]);
- offset += bp->b_maps[i].bm_len;
+ offset += BBTOB(bp->b_maps[i].bm_len);
}
/*
@@ -915,20 +915,28 @@ xfs_buf_item_log(
for (i = 0; i < bip->bli_format_count; i++) {
if (start > last)
break;
- end = start + BBTOB(bp->b_maps[i].bm_len);
+ end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
+
+ /* skip to the map that includes the first byte to log */
if (first > end) {
start += BBTOB(bp->b_maps[i].bm_len);
continue;
}
+
+ /*
+ * Trim the range to this segment and mark it in the bitmap.
+ * Note that we must convert buffer offsets to segment relative
+ * offsets (e.g., the first byte of each segment is byte 0 of
+ * that segment).
+ */
if (first < start)
first = start;
if (end > last)
end = last;
-
- xfs_buf_item_log_segment(first, end,
+ xfs_buf_item_log_segment(first - start, end - start,
&bip->bli_formats[i].blf_data_map[0]);
- start += bp->b_maps[i].bm_len;
+ start += BBTOB(bp->b_maps[i].bm_len);
}
}
@@ -949,6 +957,7 @@ xfs_buf_item_free(
xfs_buf_log_item_t *bip)
{
xfs_buf_item_free_format(bip);
+ kmem_free(bip->bli_item.li_lv_shadow);
kmem_zone_free(xfs_buf_item_zone, bip);
}
@@ -1042,35 +1051,22 @@ xfs_buf_do_callbacks(
}
}
-/*
- * This is the iodone() function for buffers which have had callbacks
- * attached to them by xfs_buf_attach_iodone(). It should remove each
- * log item from the buffer's list and call the callback of each in turn.
- * When done, the buffer's fsprivate field is set to NULL and the buffer
- * is unlocked with a call to iodone().
- */
-void
-xfs_buf_iodone_callbacks(
+static bool
+xfs_buf_iodone_callback_error(
struct xfs_buf *bp)
{
struct xfs_log_item *lip = bp->b_fspriv;
struct xfs_mount *mp = lip->li_mountp;
static ulong lasttime;
static xfs_buftarg_t *lasttarg;
-
- if (likely(!bp->b_error))
- goto do_callbacks;
+ struct xfs_error_cfg *cfg;
/*
* If we've already decided to shutdown the filesystem because of
* I/O errors, there's no point in giving this a retry.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
- xfs_buf_stale(bp);
- bp->b_flags |= XBF_DONE;
- trace_xfs_buf_item_iodone(bp, _RET_IP_);
- goto do_callbacks;
- }
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out_stale;
if (bp->b_target != lasttarg ||
time_after(jiffies, (lasttime + 5*HZ))) {
@@ -1079,45 +1075,93 @@ xfs_buf_iodone_callbacks(
}
lasttarg = bp->b_target;
+ /* synchronous writes will have callers process the error */
+ if (!(bp->b_flags & XBF_ASYNC))
+ goto out_stale;
+
+ trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+ ASSERT(bp->b_iodone != NULL);
+
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+
/*
* If the write was asynchronous then no one will be looking for the
- * error. Clear the error state and write the buffer out again.
- *
- * XXX: This helps against transient write errors, but we need to find
- * a way to shut the filesystem down if the writes keep failing.
- *
- * In practice we'll shut the filesystem down soon as non-transient
- * errors tend to affect the whole device and a failing log write
- * will make us give up. But we really ought to do better here.
+ * error. If this is the first failure of this type, clear the error
+ * state and write the buffer out again. This means we always retry an
+ * async write failure at least once, but we also need to set the buffer
+ * up to behave correctly now for repeated failures.
*/
- if (bp->b_flags & XBF_ASYNC) {
- ASSERT(bp->b_iodone != NULL);
+ if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
+ bp->b_last_error != bp->b_error) {
+ bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
+ bp->b_last_error = bp->b_error;
+ if (cfg->retry_timeout && !bp->b_first_retry_time)
+ bp->b_first_retry_time = jiffies;
+
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_submit(bp);
+ return true;
+ }
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+ /*
+ * Repeated failure on an async write. Take action according to the
+ * error configuration we have been set up to use.
+ */
- xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
+ if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+ ++bp->b_retries > cfg->max_retries)
+ goto permanent_error;
+ if (cfg->retry_timeout &&
+ time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+ goto permanent_error;
- if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
- bp->b_flags |= XBF_WRITE | XBF_ASYNC |
- XBF_DONE | XBF_WRITE_FAIL;
- xfs_buf_submit(bp);
- } else {
- xfs_buf_relse(bp);
- }
+ /* At unmount we may treat errors differently */
+ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+ goto permanent_error;
- return;
- }
+ /* still a transient error, higher layers will retry */
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
/*
- * If the write of the buffer was synchronous, we want to make
- * sure to return the error to the caller of xfs_bwrite().
+ * Permanent error - we need to trigger a shutdown if we haven't already
+ * to indicate that inconsistency will result from this action.
*/
+permanent_error:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+out_stale:
xfs_buf_stale(bp);
bp->b_flags |= XBF_DONE;
-
trace_xfs_buf_error_relse(bp, _RET_IP_);
+ return false;
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks attached
+ * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
+ * callback list, mark the buffer as having no more callbacks and then push the
+ * buffer through IO completion processing.
+ */
+void
+xfs_buf_iodone_callbacks(
+ struct xfs_buf *bp)
+{
+ /*
+ * If there is an error, process it. Some errors require us
+ * to run callbacks after failure processing is done so we
+ * detect that and take appropriate action.
+ */
+ if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+ return;
+
+ /*
+ * Successful IO or permanent error. Either way, we can clear the
+ * retry state here in preparation for the next error that may occur.
+ */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
-do_callbacks:
xfs_buf_do_callbacks(bp);
bp->b_fspriv = NULL;
bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 93b3ab0c54350..f44f799969786 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -273,10 +273,11 @@ xfs_dir2_leaf_readbuf(
size_t bufsize,
struct xfs_dir2_leaf_map_info *mip,
xfs_dir2_off_t *curoff,
- struct xfs_buf **bpp)
+ struct xfs_buf **bpp,
+ bool trim_map)
{
struct xfs_inode *dp = args->dp;
- struct xfs_buf *bp = *bpp;
+ struct xfs_buf *bp = NULL;
struct xfs_bmbt_irec *map = mip->map;
struct blk_plug plug;
int error = 0;
@@ -286,13 +287,10 @@ xfs_dir2_leaf_readbuf(
struct xfs_da_geometry *geo = args->geo;
/*
- * If we have a buffer, we need to release it and
- * take it out of the mapping.
+ * If the caller just finished processing a buffer, it will tell us
+ * we need to trim that block out of the mapping now it is done.
*/
-
- if (bp) {
- xfs_trans_brelse(NULL, bp);
- bp = NULL;
+ if (trim_map) {
mip->map_blocks -= geo->fsbcount;
/*
* Loop to get rid of the extents for the
@@ -533,10 +531,17 @@ xfs_dir2_leaf_getdents(
*/
if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
int lock_mode;
+ bool trim_map = false;
+
+ if (bp) {
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ trim_map = true;
+ }
lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
- &curoff, &bp);
+ &curoff, &bp, trim_map);
xfs_iunlock(dp, lock_mode);
if (error || !map_info->map_valid)
break;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 272c3f8b6f7d0..4ff499aa7338f 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -179,7 +179,7 @@ xfs_ioc_trim(
* matter as trimming blocks is an advisory interface.
*/
if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
- range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+ range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
range.len < mp->m_sb.sb_blocksize)
return -EINVAL;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 316b2a1bdba5f..7a30b8f11db7a 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -74,6 +75,7 @@ xfs_qm_dqdestroy(
{
ASSERT(list_empty(&dqp->q_lru));
+ kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
@@ -306,7 +308,7 @@ xfs_qm_dqalloc(
xfs_buf_t **O_bpp)
{
xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
+ struct xfs_defer_ops dfops;
xfs_bmbt_irec_t map;
int nmaps, error;
xfs_buf_t *bp;
@@ -319,7 +321,7 @@ xfs_qm_dqalloc(
/*
* Initialize the bmap freelist prior to calling bmapi code.
*/
- xfs_bmap_init(&flist, &firstblock);
+ xfs_defer_init(&dfops, &firstblock);
xfs_ilock(quotip, XFS_ILOCK_EXCL);
/*
* Return if this type of quotas is turned off while we didn't
@@ -335,7 +337,7 @@ xfs_qm_dqalloc(
error = xfs_bmapi_write(tp, quotip, offset_fsb,
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
&firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
- &map, &nmaps, &flist);
+ &map, &nmaps, &dfops);
if (error)
goto error0;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -367,7 +369,7 @@ xfs_qm_dqalloc(
dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
/*
- * xfs_bmap_finish() may commit the current transaction and
+ * xfs_defer_finish() may commit the current transaction and
* start a second transaction if the freelist is not empty.
*
* Since we still want to modify this buffer, we need to
@@ -381,7 +383,7 @@ xfs_qm_dqalloc(
xfs_trans_bhold(tp, bp);
- error = xfs_bmap_finish(tpp, &flist, NULL);
+ error = xfs_defer_finish(tpp, &dfops, NULL);
if (error)
goto error1;
@@ -397,7 +399,7 @@ xfs_qm_dqalloc(
return 0;
error1:
- xfs_bmap_cancel(&flist);
+ xfs_defer_cancel(&dfops);
error0:
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
@@ -614,11 +616,10 @@ xfs_qm_dqread(
trace_xfs_dqread(dqp);
if (flags & XFS_QMOPT_DQALLOC) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
- XFS_QM_DQALLOC_SPACE_RES(mp), 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
if (error)
- goto error1;
+ goto error0;
}
/*
@@ -692,7 +693,7 @@ error0:
* end of the chunk, skip ahead to first id in next allocated chunk
* using the SEEK_DATA interface.
*/
-int
+static int
xfs_dq_get_next_id(
xfs_mount_t *mp,
uint type,
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 814cff94e78f6..2c7a1629e064b 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
spin_lock(&ailp->xa_lock);
xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+ kmem_free(qfs->qql_item.li_lv_shadow);
+ kmem_free(lip->li_lv_shadow);
kmem_free(qfs);
kmem_free(qfe);
return (xfs_lsn_t)-1;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 88693a98fac5e..ed7ee4e8af738 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
}
int
-xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
{
int i;
int len;
int64_t fsid;
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
+
memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 4ed3042a0f160..3d224702fbc0c 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -90,7 +90,9 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERRTAG_STRATCMPL_IOERR 19
#define XFS_ERRTAG_DIOWRITE_IOERR 20
#define XFS_ERRTAG_BMAPIFORMAT 21
-#define XFS_ERRTAG_MAX 22
+#define XFS_ERRTAG_FREE_EXTENT 22
+#define XFS_ERRTAG_RMAP_FINISH_ONE 23
+#define XFS_ERRTAG_MAX 24
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -117,6 +119,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_FREE_EXTENT 1
+#define XFS_RANDOM_RMAP_FINISH_ONE 1
#ifdef DEBUG
extern int xfs_error_test_active;
@@ -128,7 +132,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
(rf))))
-extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
#else
#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index a1b2dd828b9d0..fe1bfee35898e 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
.fh_to_parent = xfs_fs_fh_to_parent,
.get_parent = xfs_fs_get_parent,
.commit_metadata = xfs_fs_nfs_commit_metadata,
-#ifdef CONFIG_NFSD_BLOCKLAYOUT
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
.get_uuid = xfs_fs_get_uuid,
.map_blocks = xfs_fs_map_blocks,
.commit_blocks = xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 4aa0153214f91..d7bc14906af87 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -20,12 +20,15 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_extfree_item.h"
#include "xfs_log.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
kmem_zone_t *xfs_efi_zone;
@@ -40,6 +43,7 @@ void
xfs_efi_item_free(
struct xfs_efi_log_item *efip)
{
+ kmem_free(efip->efi_item.li_lv_shadow);
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
@@ -300,6 +304,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_efd_item_free(struct xfs_efd_log_item *efdp)
{
+ kmem_free(efdp->efd_item.li_lv_shadow);
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
@@ -484,3 +489,69 @@ xfs_efd_init(
return efdp;
}
+
+/*
+ * Process an extent free intent item that was recovered from
+ * the log. We need to free the extents that it describes.
+ *
+ * Every extent recorded in the EFI is sanity checked before anything is
+ * freed. If any extent fails the check, the EFI is marked recovered,
+ * released, and -EIO is returned without freeing anything.
+ *
+ * Otherwise a transaction is allocated with the tr_itruncate reservation,
+ * an EFD sized for all of the EFI's extents is obtained from it, and each
+ * extent is freed through xfs_trans_free_extent(). The owner info handed
+ * to the free is set up by xfs_rmap_skip_owner_update() (presumably so that
+ * no rmap owner update is attempted during recovery - confirm against the
+ * rmap code).
+ *
+ * XFS_EFI_RECOVERED is set immediately before the commit and the result of
+ * xfs_trans_commit() is returned. If the transaction allocation fails its
+ * error is returned directly; if freeing an extent fails the transaction is
+ * cancelled and that error is returned. In both of those cases the
+ * XFS_EFI_RECOVERED bit is left clear.
+ */
+int
+xfs_efi_recover(
+ struct xfs_mount *mp,
+ struct xfs_efi_log_item *efip)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_trans *tp;
+ int i;
+ int error = 0;
+ xfs_extent_t *extp;
+ xfs_fsblock_t startblock_fsb;
+ struct xfs_owner_info oinfo;
+
+ ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+
+ /*
+ * First check the validity of the extents described by the
+ * EFI. If any are bad, then assume that all are bad and
+ * just toss the EFI.
+ *
+ * An extent is rejected when its start block (after a round trip
+ * through a disk address) is zero or not below sb_dblocks, or when
+ * its length is zero or not below sb_agblocks.
+ */
+ for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ extp = &efip->efi_format.efi_extents[i];
+ startblock_fsb = XFS_BB_TO_FSB(mp,
+ XFS_FSB_TO_DADDR(mp, extp->ext_start));
+ if (startblock_fsb == 0 ||
+ extp->ext_len == 0 ||
+ startblock_fsb >= mp->m_sb.sb_dblocks ||
+ extp->ext_len >= mp->m_sb.sb_agblocks) {
+ /*
+ * This will pull the EFI from the AIL and
+ * free the memory associated with it.
+ * The recovered bit is set first so the item is
+ * not left looking like it still needs recovery.
+ */
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+ xfs_efi_release(efip);
+ return -EIO;
+ }
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
+ return error;
+ efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+
+ xfs_rmap_skip_owner_update(&oinfo);
+ for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ extp = &efip->efi_format.efi_extents[i];
+ error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
+ extp->ext_len, &oinfo);
+ if (error)
+ goto abort_error;
+
+ }
+
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+ error = xfs_trans_commit(tp);
+ return error;
+
+abort_error:
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 8fa8651705e1d..a32c794a86b7b 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -98,4 +98,7 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf,
void xfs_efi_item_free(xfs_efi_log_item_t *);
void xfs_efi_release(struct xfs_efi_log_item *);
+int xfs_efi_recover(struct xfs_mount *mp,
+ struct xfs_efi_log_item *efip);
+
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ac0fd32de31e4..ed95e5bb04e69 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
}
/*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
+ *
+ * @ip:       inode whose data is to be zeroed
+ * @pos:      byte offset at which zeroing starts
+ * @count:    number of bytes to zero
+ * @did_zero: forwarded to iomap_zero_range(); presumably set when any bytes
+ *            were actually zeroed (confirm against iomap_zero_range())
+ *
+ * Returns the result of iomap_zero_range(). The caller's @did_zero pointer
+ * must be passed through rather than NULL, otherwise callers that rely on it
+ * can never learn that zeroing took place.
*/
int
-xfs_iozero(
- struct xfs_inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count) /* size of data to zero */
+xfs_zero_range(
+ struct xfs_inode *ip,
+ xfs_off_t pos,
+ xfs_off_t count,
+ bool *did_zero)
{
- struct page *page;
- struct address_space *mapping;
- int status = 0;
-
-
- mapping = VFS_I(ip)->i_mapping;
- do {
- unsigned offset, bytes;
- void *fsdata;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- if (IS_DAX(VFS_I(ip))) {
- status = dax_zero_page_range(VFS_I(ip), pos, bytes,
- xfs_get_blocks_direct);
- if (status)
- break;
- } else {
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
-
- zero_user(page, offset, bytes);
-
- status = pagecache_write_end(NULL, mapping, pos, bytes,
- bytes, page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
- status = 0;
- }
- pos += bytes;
- count -= bytes;
- } while (count);
-
- return status;
+ return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
}
int
@@ -145,12 +102,10 @@ xfs_update_prealloc_flags(
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
- error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+ 0, 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -284,48 +239,35 @@ xfs_file_fsync(
}
STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- size_t size = iov_iter_count(to);
+ loff_t isize = i_size_read(inode);
+ size_t count = iov_iter_count(to);
+ struct iov_iter data;
+ struct xfs_buftarg *target;
ssize_t ret = 0;
- int ioflags = 0;
- xfs_fsize_t n;
- loff_t pos = iocb->ki_pos;
-
- XFS_STATS_INC(mp, xs_read_calls);
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
- ioflags |= XFS_IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
- if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- /* DIO must be aligned to device logical sector size */
- if ((pos | size) & target->bt_logical_sectormask) {
- if (pos == i_size_read(inode))
- return 0;
- return -EINVAL;
- }
- }
+ trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
- n = mp->m_super->s_maxbytes - pos;
- if (n <= 0 || size == 0)
- return 0;
+ if (!count)
+ return 0; /* skip atime */
- if (n < size)
- size = n;
+ if (XFS_IS_REALTIME_INODE(ip))
+ target = ip->i_mount->m_rtdev_targp;
+ else
+ target = ip->i_mount->m_ddev_targp;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
+ /* DIO must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+ if (iocb->ki_pos == isize)
+ return 0;
+ return -EINVAL;
+ }
/*
* Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -338,7 +280,7 @@ xfs_file_read_iter(
* serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ if (mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
@@ -353,8 +295,8 @@ xfs_file_read_iter(
* flush and reduce the chances of repeated iolock cycles going
* forward.
*/
- if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait(mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
@@ -365,20 +307,95 @@ xfs_file_read_iter(
* we fail to invalidate a page, but this should never
* happen on XFS. Warn if it does fail.
*/
- ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+ ret = invalidate_inode_pages2(mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
- trace_xfs_file_read(ip, size, pos, ioflags);
+ data = *to;
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, NULL, NULL, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
+ }
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+/*
+ * Read path used for DAX inodes.
+ *
+ * A zero-length request returns 0 straight after the tracepoint, before the
+ * lock is taken and without calling file_accessed(). Otherwise dax_do_io()
+ * is run under xfs_rw_ilock(XFS_IOLOCK_SHARED) on a private copy of the
+ * iterator; iocb->ki_pos and @to are advanced only by the positive byte count
+ * it reports. file_accessed() is called after the lock is dropped whatever
+ * the result, and the dax_do_io() return value is handed back unchanged.
+ */
+static noinline ssize_t
+xfs_file_dax_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct iov_iter data = *to;
+ size_t count = iov_iter_count(to);
+ ssize_t ret = 0;
+
+ trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+ if (!count)
+ return 0; /* skip atime */
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
+ }
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+/*
+ * Buffered read path: emit the buffered-read tracepoint, then run
+ * generic_file_read_iter() with XFS_IOLOCK_SHARED held via xfs_rw_ilock()
+ * and return its result unchanged.
+ */
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
ret = generic_file_read_iter(iocb, to);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+}
+/*
+ * Top-level read dispatcher.
+ *
+ * Counts the call in xs_read_calls and fails with -EIO if the filesystem has
+ * been forcibly shut down. The request is then routed to one of three
+ * helpers: DAX inodes go to xfs_file_dax_read() (checked first, so it wins
+ * over IOCB_DIRECT), IOCB_DIRECT requests go to xfs_file_dio_aio_read(), and
+ * everything else goes to xfs_file_buffered_aio_read(). Locking is left to
+ * the helpers. A positive result is added to xs_read_bytes before being
+ * returned; zero and error results are returned as-is.
+ */
+STATIC ssize_t
+xfs_file_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ ssize_t ret = 0;
+
+ XFS_STATS_INC(mp, xs_read_calls);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_read(iocb, to);
+ else if (iocb->ki_flags & IOCB_DIRECT)
+ ret = xfs_file_dio_aio_read(iocb, to);
+ else
+ ret = xfs_file_buffered_aio_read(iocb, to);
+
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -391,18 +408,14 @@ xfs_file_splice_read(
unsigned int flags)
{
struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
- int ioflags = 0;
ssize_t ret;
XFS_STATS_INC(ip->i_mount, xs_read_calls);
- if (infilp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+ trace_xfs_file_splice_read(ip, count, *ppos);
/*
* DAX inodes cannot ues the page cache for splice, so we have to push
@@ -426,49 +439,6 @@ out:
}
/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF. We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int /* error (positive) */
-xfs_zero_last_block(
- struct xfs_inode *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize,
- bool *did_zeroing)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
- int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- int zero_len;
- int nimaps = 1;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- /*
- * If the block underlying isize is just a hole, then there
- * is nothing to zero.
- */
- if (imap.br_startblock == HOLESTARTBLOCK)
- return 0;
-
- zero_len = mp->m_sb.sb_blocksize - zero_offset;
- if (isize + zero_len > offset)
- zero_len = offset - isize;
- *did_zeroing = true;
- return xfs_iozero(ip, isize, zero_len);
-}
-
-/*
* Zero any on disk space between the current EOF and the new, larger EOF.
*
* This handles the normal case of zeroing the remainder of the last block in
@@ -486,94 +456,11 @@ xfs_zero_eof(
xfs_fsize_t isize, /* current inode size */
bool *did_zeroing)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
trace_xfs_zero_eof(ip, isize, offset - isize);
-
- /*
- * First handle zeroing the block on which isize resides.
- *
- * We only zero a part of that block so it is handled specially.
- */
- if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
- error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
- if (error)
- return error;
- }
-
- /*
- * Calculate the range between the new size and the old where blocks
- * needing to be zeroed may exist.
- *
- * To get the block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back to a block
- * boundary. We subtract 1 in case the size is exactly on a block
- * boundary.
- */
- last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
- start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
- ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
- if (last_fsb == end_zero_fsb) {
- /*
- * The size was only incremented on its last block.
- * We took care of that above, so just return.
- */
- return 0;
- }
-
- ASSERT(start_zero_fsb <= end_zero_fsb);
- while (start_zero_fsb <= end_zero_fsb) {
- nimaps = 1;
- zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
- &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- if (imap.br_state == XFS_EXT_UNWRITTEN ||
- imap.br_startblock == HOLESTARTBLOCK) {
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- continue;
- }
-
- /*
- * There are blocks we need to zero.
- */
- zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
- zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
- if ((zero_off + zero_len) > offset)
- zero_len = offset - zero_off;
-
- error = xfs_iozero(ip, zero_off, zero_len);
- if (error)
- return error;
-
- *did_zeroing = true;
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- }
-
- return 0;
+ return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
}
/*
@@ -718,18 +605,18 @@ xfs_file_dio_aio_write(
int unaligned_io = 0;
int iolock;
size_t count = iov_iter_count(from);
- loff_t pos = iocb->ki_pos;
loff_t end;
struct iov_iter data;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
- if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + count) & mp->m_blockmask))
unaligned_io = 1;
/*
@@ -760,11 +647,10 @@ xfs_file_dio_aio_write(
if (ret)
goto out;
count = iov_iter_count(from);
- pos = iocb->ki_pos;
- end = pos + count - 1;
+ end = iocb->ki_pos + count - 1;
/*
- * See xfs_file_read_iter() for why we do a full-file flush here.
+ * See xfs_file_dio_aio_read() for why we do a full-file flush here.
*/
if (mapping->nrpages) {
ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -791,31 +677,92 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
data = *from;
- ret = mapping->a_ops->direct_IO(iocb, &data, pos);
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, xfs_end_io_direct_write,
+ NULL, DIO_ASYNC_EXTEND);
/* see generic_file_direct_write() for why this is necessary */
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT,
- end >> PAGE_CACHE_SHIFT);
+ iocb->ki_pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
}
if (ret > 0) {
- pos += ret;
+ iocb->ki_pos += ret;
iov_iter_advance(from, ret);
- iocb->ki_pos = pos;
}
out:
xfs_rw_iunlock(ip, iolock);
/*
- * No fallback to buffered IO on errors for XFS. DAX can result in
- * partial writes, but direct IO will either complete fully or fail.
+ * No fallback to buffered IO on errors for XFS, direct IO will either
+ * complete fully or fail.
*/
- ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ int unaligned_io = 0;
+ int iolock;
+ struct iov_iter data;
+
+ /* "unaligned" here means not aligned to a filesystem block */
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+ unaligned_io = 1;
+ iolock = XFS_IOLOCK_EXCL;
+ } else if (mapping->nrpages) {
+ iolock = XFS_IOLOCK_EXCL;
+ } else {
+ iolock = XFS_IOLOCK_SHARED;
+ }
+ xfs_rw_ilock(ip, iolock);
+
+ ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out;
+
+ /*
+ * Yes, even DAX files can have page cache attached to them: A zeroed
+ * page is inserted into the pagecache when we have to serve a write
+ * fault on a hole. It should never be dirtied and can simply be
+ * dropped from the pagecache once we get real data for the page.
+ */
+ if (mapping->nrpages) {
+ ret = invalidate_inode_pages2(mapping);
+ WARN_ON_ONCE(ret);
+ }
+
+ if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+ data = *from;
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(from, ret);
+ }
+out:
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -842,9 +789,8 @@ xfs_file_buffered_aio_write(
current->backing_dev_info = inode_to_bdi(inode);
write_retry:
- trace_xfs_file_buffered_write(ip, iov_iter_count(from),
- iocb->ki_pos, 0);
- ret = generic_perform_write(file, from, iocb->ki_pos);
+ trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
@@ -898,20 +844,18 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_write(iocb, from);
+ else if (iocb->ki_flags & IOCB_DIRECT)
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
if (ret > 0) {
- ssize_t err;
-
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
- err = generic_write_sync(file, iocb->ki_pos - ret, ret);
- if (err < 0)
- ret = err;
+ ret = generic_write_sync(iocb, ret);
}
return ret;
}
@@ -1207,9 +1151,9 @@ xfs_find_get_desired_pgoff(
pagevec_init(&pvec, 0);
- index = startoff >> PAGE_CACHE_SHIFT;
+ index = startoff >> PAGE_SHIFT;
endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
- end = endoff >> PAGE_CACHE_SHIFT;
+ end = endoff >> PAGE_SHIFT;
do {
int want;
unsigned nr_pages;
@@ -1558,9 +1502,9 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
} else {
- ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
}
@@ -1592,7 +1536,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1629,8 +1573,7 @@ xfs_filemap_pmd_fault(
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
- NULL);
+ ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (flags & FAULT_FLAG_WRITE)
@@ -1714,7 +1657,7 @@ const struct file_operations xfs_file_operations = {
const struct file_operations xfs_dir_file_operations = {
.open = xfs_dir_open,
.read = generic_read_dir,
- .iterate = xfs_file_readdir,
+ .iterate_shared = xfs_file_readdir,
.llseek = generic_file_llseek,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a51353a1f87f1..4a33a33043691 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -22,6 +22,7 @@
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -385,7 +386,7 @@ xfs_filestream_new_ag(
}
flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
- (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
+ (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0);
err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ee3aaa0a53179..0f96847b90e11 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
@@ -32,6 +33,7 @@
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
+#include "xfs_rmap_btree.h"
#include "xfs_ialloc.h"
#include "xfs_fsops.h"
#include "xfs_itable.h"
@@ -40,6 +42,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_filestream.h"
+#include "xfs_rmap.h"
/*
* File system operations
@@ -103,7 +106,9 @@ xfs_fs_geometry(
(xfs_sb_version_hasfinobt(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
(xfs_sb_version_hassparseinodes(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
+ XFS_FSOP_GEOM_FLAGS_SPINODES : 0) |
+ (xfs_sb_version_hasrmapbt(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_RMAPBT : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -198,14 +203,10 @@ xfs_growfs_data_private(
return error;
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
- tp->t_flags |= XFS_TRANS_RESERVE;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
- XFS_GROWFS_SPACE_RES(mp), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+ XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
return error;
- }
/*
* Write new AG headers to disk. Non-transactional, but written
@@ -243,10 +244,16 @@ xfs_growfs_data_private(
agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
- agf->agf_flfirst = 0;
- agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ agf->agf_roots[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ }
+
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_fllast = 0;
agf->agf_flcount = 0;
- tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
+ tmpsize = agsize - mp->m_ag_prealloc_blocks;
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
if (xfs_sb_version_hascrc(&mp->m_sb))
@@ -343,7 +350,7 @@ xfs_growfs_data_private(
agno, 0);
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
@@ -372,7 +379,7 @@ xfs_growfs_data_private(
agno, 0);
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
nfree += be32_to_cpu(arec->ar_blockcount);
@@ -382,6 +389,72 @@ xfs_growfs_data_private(
if (error)
goto error0;
+ /* RMAP btree root block */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ struct xfs_rmap_rec *rrec;
+ struct xfs_btree_block *block;
+
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_rmapbt_buf_ops);
+ if (!bp) {
+ error = -ENOMEM;
+ goto error0;
+ }
+
+ xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ block = XFS_BUF_TO_BLOCK(bp);
+
+
+ /*
+ * Mark the AG header regions as static metadata. The BNO
+ * btree block is the first block after the headers, so
+ * its location defines the size of the region the static
+ * metadata consumes.
+ *
+ * Note: unlike mkfs, we never have to account for log
+ * space when growing the data regions
+ */
+ rrec = XFS_RMAP_REC_ADDR(block, 1);
+ rrec->rm_startblock = 0;
+ rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+
+ /* account freespace btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 2);
+ rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(2);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+
+ /* account inode btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 3);
+ rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+ XFS_IBT_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+
+ /* account for rmap btree root */
+ rrec = XFS_RMAP_REC_ADDR(block, 4);
+ rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+ }
+
/*
* INO btree root block
*/
@@ -439,6 +512,8 @@ xfs_growfs_data_private(
* There are new blocks in the old last a.g.
*/
if (new) {
+ struct xfs_owner_info oinfo;
+
/*
* Change the agi length.
*/
@@ -466,14 +541,20 @@ xfs_growfs_data_private(
be32_to_cpu(agi->agi_length));
xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+
/*
* Free the new space.
+ *
+ * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+ * this doesn't actually exist in the rmap btree.
*/
- error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
- be32_to_cpu(agf->agf_length) - new), new);
- if (error) {
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
+ error = xfs_free_extent(tp,
+ XFS_AGB_TO_FSB(mp, agno,
+ be32_to_cpu(agf->agf_length) - new),
+ new, &oinfo);
+ if (error)
goto error0;
- }
}
/*
@@ -505,6 +586,7 @@ xfs_growfs_data_private(
} else
mp->m_maxicount = 0;
xfs_set_low_space_thresholds(mp);
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
@@ -642,7 +724,7 @@ xfs_fs_counts(
cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- XFS_ALLOC_SET_ASIDE(mp);
+ mp->m_alloc_set_aside;
spin_lock(&mp->m_sb_lock);
cnt->freertx = mp->m_sb.sb_frextents;
@@ -671,8 +753,11 @@ xfs_reserve_blocks(
__uint64_t *inval,
xfs_fsop_resblks_t *outval)
{
- __int64_t lcounter, delta, fdblks_delta;
+ __int64_t lcounter, delta;
+ __int64_t fdblks_delta = 0;
__uint64_t request;
+ __int64_t free;
+ int error = 0;
/* If inval is null, report current values and return */
if (inval == (__uint64_t *)NULL) {
@@ -686,24 +771,23 @@ xfs_reserve_blocks(
request = *inval;
/*
- * With per-cpu counters, this becomes an interesting
- * problem. we needto work out if we are freeing or allocation
- * blocks first, then we can do the modification as necessary.
+ * With per-cpu counters, this becomes an interesting problem. We need
+ * to work out if we are freeing or allocating blocks first, then we can
+ * do the modification as necessary.
*
- * We do this under the m_sb_lock so that if we are near
- * ENOSPC, we will hold out any changes while we work out
- * what to do. This means that the amount of free space can
- * change while we do this, so we need to retry if we end up
- * trying to reserve more space than is available.
+ * We do this under the m_sb_lock so that if we are near ENOSPC, we will
+ * hold out any changes while we work out what to do. This means that
+ * the amount of free space can change while we do this, so we need to
+ * retry if we end up trying to reserve more space than is available.
*/
-retry:
spin_lock(&mp->m_sb_lock);
/*
* If our previous reservation was larger than the current value,
- * then move any unused blocks back to the free pool.
+ * then move any unused blocks back to the free pool. Modify the resblks
+ * counters directly since we shouldn't have any problems unreserving
+ * space.
*/
- fdblks_delta = 0;
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (lcounter > 0) { /* release unused blocks */
@@ -711,54 +795,67 @@ retry:
mp->m_resblks_avail -= lcounter;
}
mp->m_resblks = request;
- } else {
- __int64_t free;
+ if (fdblks_delta) {
+ spin_unlock(&mp->m_sb_lock);
+ error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ spin_lock(&mp->m_sb_lock);
+ }
+ goto out;
+ }
+
+ /*
+ * If the request is larger than the current reservation, reserve the
+ * blocks before we update the reserve counters. Sample m_fdblocks and
+ * perform a partial reservation if the request exceeds free space.
+ */
+ error = -ENOSPC;
+ do {
free = percpu_counter_sum(&mp->m_fdblocks) -
- XFS_ALLOC_SET_ASIDE(mp);
+ mp->m_alloc_set_aside;
if (!free)
- goto out; /* ENOSPC and fdblks_delta = 0 */
+ break;
delta = request - mp->m_resblks;
lcounter = free - delta;
- if (lcounter < 0) {
+ if (lcounter < 0)
/* We can't satisfy the request, just get what we can */
- mp->m_resblks += free;
- mp->m_resblks_avail += free;
- fdblks_delta = -free;
- } else {
- fdblks_delta = -delta;
- mp->m_resblks = request;
- mp->m_resblks_avail += delta;
- }
- }
-out:
- if (outval) {
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- }
- spin_unlock(&mp->m_sb_lock);
+ fdblks_delta = free;
+ else
+ fdblks_delta = delta;
- if (fdblks_delta) {
/*
- * If we are putting blocks back here, m_resblks_avail is
- * already at its max so this will put it in the free pool.
- *
- * If we need space, we'll either succeed in getting it
- * from the free block count or we'll get an enospc. If
- * we get a ENOSPC, it means things changed while we were
- * calculating fdblks_delta and so we should try again to
- * see if there is anything left to reserve.
+ * We'll either succeed in getting space from the free block
+ * count or we'll get an ENOSPC. If we get an ENOSPC, it means
+ * things changed while we were calculating fdblks_delta and so
+ * we should try again to see if there is anything left to
+ * reserve.
*
* Don't set the reserved flag here - we don't want to reserve
* the extra reserve blocks from the reserve.....
*/
- int error;
- error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
- if (error == -ENOSPC)
- goto retry;
+ spin_unlock(&mp->m_sb_lock);
+ error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ spin_lock(&mp->m_sb_lock);
+ } while (error == -ENOSPC);
+
+ /*
+ * Update the reserve counters if blocks have been successfully
+ * allocated.
+ */
+ if (!error && fdblks_delta) {
+ mp->m_resblks += fdblks_delta;
+ mp->m_resblks_avail += fdblks_delta;
}
- return 0;
+
+out:
+ if (outval) {
+ outval->resblks = mp->m_resblks;
+ outval->resblks_avail = mp->m_resblks_avail;
+ }
+
+ spin_unlock(&mp->m_sb_lock);
+ return error;
}
int
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bf2d607492786..fb39a66914dd8 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,9 +37,6 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
-STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
- struct xfs_perag *pag, struct xfs_inode *ip);
-
/*
* Allocate and initialise an xfs_inode.
*/
@@ -94,13 +91,6 @@ xfs_inode_free_callback(
struct inode *inode = container_of(head, struct inode, i_rcu);
struct xfs_inode *ip = XFS_I(inode);
- kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
- struct xfs_inode *ip)
-{
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
@@ -118,6 +108,25 @@ xfs_inode_free(
ip->i_itemp = NULL;
}
+ kmem_zone_free(xfs_inode_zone, ip);
+}
+
+static void
+__xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!xfs_isiflocked(ip));
+ XFS_STATS_DEC(ip->i_mount, vn_active);
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+void
+xfs_inode_free(
+ struct xfs_inode *ip)
+{
/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
@@ -129,12 +138,123 @@ xfs_inode_free(
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
- /* asserts to verify all state is correct here */
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
- XFS_STATS_DEC(ip->i_mount, vn_active);
+ __xfs_inode_free(ip);
+}
- call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+ struct xfs_mount *mp)
+{
+
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+ msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+
+ xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+ xfs_reclaim_work_queue(mp);
+}
+
+static void
+xfs_perag_set_reclaim_tag(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ if (pag->pag_ici_reclaimable++)
+ return;
+
+ /* propagate the reclaim tag up into the perag radix tree */
+ spin_lock(&mp->m_perag_lock);
+ radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&mp->m_perag_lock);
+
+ /* schedule periodic background inode reclaim */
+ xfs_reclaim_work_queue(mp);
+
+ trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+static void
+xfs_perag_clear_reclaim_tag(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ if (--pag->pag_ici_reclaimable)
+ return;
+
+ /* clear the reclaim tag from the perag radix tree */
+ spin_lock(&mp->m_perag_lock);
+ radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&mp->m_perag_lock);
+ trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ xfs_perag_set_reclaim_tag(pag);
+ __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+STATIC void
+xfs_inode_clear_reclaim_tag(
+ struct xfs_perag *pag,
+ xfs_ino_t ino)
+{
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(pag->pag_mount, ino),
+ XFS_ICI_RECLAIM_TAG);
+ xfs_perag_clear_reclaim_tag(pag);
}
/*
@@ -264,7 +384,7 @@ xfs_iget_cache_hit(
*/
ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
ip->i_flags |= XFS_INEW;
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
inode->i_state = I_NEW;
ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
@@ -645,7 +765,7 @@ restart:
* Background scanning to trim post-EOF preallocated space. This is queued
* based on the 'speculative_prealloc_lifetime' tunable (5m by default).
*/
-STATIC void
+void
xfs_queue_eofblocks(
struct xfs_mount *mp)
{
@@ -723,121 +843,6 @@ xfs_inode_ag_iterator_tag(
}
/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
- struct xfs_mount *mp)
-{
-
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
- queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
- msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
- }
- rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_reclaim_work);
-
- xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
- xfs_reclaim_work_queue(mp);
-}
-
-static void
-__xfs_inode_set_reclaim_tag(
- struct xfs_perag *pag,
- struct xfs_inode *ip)
-{
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
-
- if (!pag->pag_ici_reclaimable) {
- /* propagate the reclaim tag up into the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_set(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
- spin_unlock(&ip->i_mount->m_perag_lock);
-
- /* schedule periodic background inode reclaim */
- xfs_reclaim_work_queue(ip->i_mount);
-
- trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
- }
- pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
- xfs_inode_t *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_set_reclaim_tag(pag, ip);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
- xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
- xfs_perag_t *pag,
- xfs_inode_t *ip)
-{
- pag->pag_ici_reclaimable--;
- if (!pag->pag_ici_reclaimable) {
- /* clear the reclaim tag from the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
- spin_unlock(&ip->i_mount->m_perag_lock);
- trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
- }
-}
-
-STATIC void
-__xfs_inode_clear_reclaim_tag(
- xfs_mount_t *mp,
- xfs_perag_t *pag,
- xfs_inode_t *ip)
-{
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
- __xfs_inode_clear_reclaim(pag, ip);
-}
-
-/*
* Grab the inode for reclaim exclusively.
* Return 0 if we grabbed it, non-zero otherwise.
*/
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
int sync_mode)
{
struct xfs_buf *bp = NULL;
+ xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
int error;
restart:
@@ -993,6 +999,22 @@ restart:
xfs_iflock(ip);
reclaim:
+ /*
+ * Because we use RCU freeing we need to ensure the inode always appears
+ * to be reclaimed with an invalid inode number when in the free state.
+ * We do this as early as possible under the ILOCK and flush lock so
+ * that xfs_iflush_cluster() can be guaranteed to detect races with us
+ * here. By doing this, we guarantee that once xfs_iflush_cluster has
+ * locked both the XFS_ILOCK and the flush lock that it will see either
+ * a valid, flushable inode that will serialise correctly against the
+ * locks below, or it will see a clean (and invalid) inode that it can
+ * skip.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1006,9 +1028,9 @@ reclaim:
*/
spin_lock(&pag->pag_ici_lock);
if (!radix_tree_delete(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+ XFS_INO_TO_AGINO(ip->i_mount, ino)))
ASSERT(0);
- __xfs_inode_clear_reclaim(pag, ip);
+ xfs_perag_clear_reclaim_tag(pag);
spin_unlock(&pag->pag_ici_lock);
/*
@@ -1023,7 +1045,7 @@ reclaim:
xfs_qm_dqdetach(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_inode_free(ip);
+ __xfs_inode_free(ip);
return error;
out_ifunlock:
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 62f1f91c32cb3..05bac99bef75d 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
void xfs_eofblocks_worker(struct work_struct *);
+void xfs_queue_eofblocks(struct xfs_mount *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96f606deee313..e08eaea6327b5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -25,6 +25,7 @@
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -431,7 +432,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
* lock more than one at a time, lockdep will report false positives saying we
* have violated locking orders.
*/
-void
+static void
xfs_lock_inodes(
xfs_inode_t **ips,
int inodes,
@@ -667,14 +668,6 @@ xfs_ip2xflags(
return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
}
-uint
-xfs_dic2xflags(
- struct xfs_dinode *dip)
-{
- return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
- be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
-}
-
/*
* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
* is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -748,7 +741,7 @@ out_unlock:
* are not linked into the directory structure - they are attached
* directly to the superblock - and so have no parent.
*/
-int
+static int
xfs_ialloc(
xfs_trans_t *tp,
xfs_inode_t *pip,
@@ -1030,7 +1023,7 @@ xfs_dir_ialloc(
tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
}
- code = xfs_trans_roll(&tp, 0);
+ code = xfs_trans_roll(&tp, NULL);
if (committed != NULL)
*committed = 1;
@@ -1085,7 +1078,7 @@ xfs_dir_ialloc(
* link count to go to zero, move the inode to AGI unlinked list so that it can
* be freed when the last active reference goes away via xfs_inactive().
*/
-int /* error */
+static int /* error */
xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
@@ -1104,7 +1097,7 @@ xfs_droplink(
/*
* Increment the link count on an inode & log the change.
*/
-int
+static int
xfs_bumplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
@@ -1130,7 +1123,7 @@ xfs_create(
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
int error;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
prid_t prid;
@@ -1161,11 +1154,9 @@ xfs_create(
rdev = 0;
resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
tres = &M_RES(mp)->tr_mkdir;
- tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
} else {
resblks = XFS_CREATE_SPACE_RES(mp, name->len);
tres = &M_RES(mp)->tr_create;
- tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
}
/*
@@ -1174,26 +1165,25 @@ xfs_create(
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- error = xfs_trans_reserve(tp, tres, resblks, 0);
+ error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
- error = xfs_trans_reserve(tp, tres, resblks, 0);
+ error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
}
if (error == -ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
resblks = 0;
- error = xfs_trans_reserve(tp, tres, 0, 0);
+ error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
}
if (error)
- goto out_trans_cancel;
-
+ goto out_release_inode;
xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
/*
* Reserve disk quota and the inode.
@@ -1230,7 +1220,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- &first_block, &free_list, resblks ?
+ &first_block, &dfops, resblks ?
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
ASSERT(error != -ENOSPC);
@@ -1264,7 +1254,7 @@ xfs_create(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_bmap_cancel;
@@ -1280,7 +1270,7 @@ xfs_create(
return 0;
out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
out_trans_cancel:
xfs_trans_cancel(tp);
out_release_inode:
@@ -1337,17 +1327,16 @@ xfs_create_tmpfile(
return error;
resblks = XFS_IALLOC_SPACE_RES(mp);
- tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
-
tres = &M_RES(mp)->tr_create_tmpfile;
- error = xfs_trans_reserve(tp, tres, resblks, 0);
+
+ error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
resblks = 0;
- error = xfs_trans_reserve(tp, tres, 0, 0);
+ error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
}
if (error)
- goto out_trans_cancel;
+ goto out_release_inode;
error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
pdqp, resblks, 1, 0);
@@ -1413,7 +1402,7 @@ xfs_link(
xfs_mount_t *mp = tdp->i_mount;
xfs_trans_t *tp;
int error;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
int resblks;
@@ -1432,15 +1421,14 @@ xfs_link(
if (error)
goto std_return;
- tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
resblks = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
}
if (error)
- goto error_return;
+ goto std_return;
xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1465,7 +1453,7 @@ xfs_link(
goto error_return;
}
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
/*
* Handle initial link state of O_TMPFILE inode
@@ -1477,7 +1465,7 @@ xfs_link(
}
error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
- &first_block, &free_list, resblks);
+ &first_block, &dfops, resblks);
if (error)
goto error_return;
xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1495,9 +1483,9 @@ xfs_link(
if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error) {
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
goto error_return;
}
@@ -1539,7 +1527,7 @@ xfs_itruncate_extents(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
xfs_fileoff_t first_unmap_block;
xfs_fileoff_t last_block;
@@ -1575,12 +1563,12 @@ xfs_itruncate_extents(
ASSERT(first_unmap_block < last_block);
unmap_len = last_block - first_unmap_block + 1;
while (!done) {
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
error = xfs_bunmapi(tp, ip,
first_unmap_block, unmap_len,
xfs_bmapi_aflag(whichfork),
XFS_ITRUNC_MAX_EXTENTS,
- &first_block, &free_list,
+ &first_block, &dfops,
&done);
if (error)
goto out_bmap_cancel;
@@ -1589,7 +1577,7 @@ xfs_itruncate_extents(
* Duplicate the transaction that has the permanent
* reservation and commit the old transaction.
*/
- error = xfs_bmap_finish(&tp, &free_list, ip);
+ error = xfs_defer_finish(&tp, &dfops, ip);
if (error)
goto out_bmap_cancel;
@@ -1615,7 +1603,7 @@ out_bmap_cancel:
* the transaction can be properly aborted. We just need to make sure
* we're not holding any resources that we were not when we came in.
*/
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
goto out;
}
@@ -1710,11 +1698,9 @@ xfs_inactive_truncate(
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp);
return error;
}
@@ -1758,14 +1744,12 @@ STATIC int
xfs_inactive_ifree(
struct xfs_inode *ip)
{
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
/*
* The ifree transaction might need to allocate blocks for record
* insertion to the finobt. We don't want to fail here at ENOSPC, so
@@ -1781,9 +1765,8 @@ xfs_inactive_ifree(
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- tp->t_flags |= XFS_TRANS_RESERVE;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
- XFS_IFREE_SPACE_RES(mp), 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
if (error) {
if (error == -ENOSPC) {
xfs_warn_ratelimited(mp,
@@ -1792,15 +1775,14 @@ xfs_inactive_ifree(
} else {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
}
- xfs_trans_cancel(tp);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- xfs_bmap_init(&free_list, &first_block);
- error = xfs_ifree(tp, ip, &free_list);
+ xfs_defer_init(&dfops, &first_block);
+ error = xfs_ifree(tp, ip, &dfops);
if (error) {
/*
* If we fail to free the inode, shut down. The cancel
@@ -1826,11 +1808,11 @@ xfs_inactive_ifree(
* Just ignore errors at this point. There is nothing we can do except
* to try to keep going. Make sure it's not a silent error.
*/
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error) {
- xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+ xfs_notice(mp, "%s: xfs_defer_finish returned error %d",
__func__, error);
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
}
error = xfs_trans_commit(tp);
if (error)
@@ -2386,7 +2368,7 @@ int
xfs_ifree(
xfs_trans_t *tp,
xfs_inode_t *ip,
- xfs_bmap_free_t *flist)
+ struct xfs_defer_ops *dfops)
{
int error;
struct xfs_icluster xic = { 0 };
@@ -2405,7 +2387,7 @@ xfs_ifree(
if (error)
return error;
- error = xfs_difree(tp, ip->i_ino, flist, &xic);
+ error = xfs_difree(tp, ip->i_ino, dfops, &xic);
if (error)
return error;
@@ -2493,7 +2475,7 @@ xfs_iunpin_wait(
* directory entry.
*
* This is still safe from a transactional point of view - it is not until we
- * get to xfs_bmap_finish() that we have the possibility of multiple
+ * get to xfs_defer_finish() that we have the possibility of multiple
* transactions in this operation. Hence as long as we remove the directory
* entry and drop the link count in the first transaction of the remove
* operation, there are no transactional constraints on the ordering here.
@@ -2508,7 +2490,7 @@ xfs_remove(
xfs_trans_t *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
int error = 0;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
uint resblks;
@@ -2525,11 +2507,6 @@ xfs_remove(
if (error)
goto std_return;
- if (is_dir)
- tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
- else
- tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-
/*
* We try to get the real space reservation first,
* allowing for directory btree deletion(s) implying
@@ -2540,14 +2517,15 @@ xfs_remove(
* block from the directory.
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
if (error == -ENOSPC) {
resblks = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
+ &tp);
}
if (error) {
ASSERT(error != -ENOSPC);
- goto out_trans_cancel;
+ goto std_return;
}
xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
@@ -2594,9 +2572,9 @@ xfs_remove(
if (error)
goto out_trans_cancel;
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
error = xfs_dir_removename(tp, dp, name, ip->i_ino,
- &first_block, &free_list, resblks);
+ &first_block, &dfops, resblks);
if (error) {
ASSERT(error != -ENOENT);
goto out_bmap_cancel;
@@ -2610,7 +2588,7 @@ xfs_remove(
if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_bmap_cancel;
@@ -2624,7 +2602,7 @@ xfs_remove(
return 0;
out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
out_trans_cancel:
xfs_trans_cancel(tp);
std_return:
@@ -2685,7 +2663,7 @@ xfs_sort_for_rename(
static int
xfs_finish_rename(
struct xfs_trans *tp,
- struct xfs_bmap_free *free_list)
+ struct xfs_defer_ops *dfops)
{
int error;
@@ -2696,9 +2674,9 @@ xfs_finish_rename(
if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, free_list, NULL);
+ error = xfs_defer_finish(&tp, dfops, NULL);
if (error) {
- xfs_bmap_cancel(free_list);
+ xfs_defer_cancel(dfops);
xfs_trans_cancel(tp);
return error;
}
@@ -2720,7 +2698,7 @@ xfs_cross_rename(
struct xfs_inode *dp2,
struct xfs_name *name2,
struct xfs_inode *ip2,
- struct xfs_bmap_free *free_list,
+ struct xfs_defer_ops *dfops,
xfs_fsblock_t *first_block,
int spaceres)
{
@@ -2732,14 +2710,14 @@ xfs_cross_rename(
/* Swap inode number for dirent in first parent */
error = xfs_dir_replace(tp, dp1, name1,
ip2->i_ino,
- first_block, free_list, spaceres);
+ first_block, dfops, spaceres);
if (error)
goto out_trans_abort;
/* Swap inode number for dirent in second parent */
error = xfs_dir_replace(tp, dp2, name2,
ip1->i_ino,
- first_block, free_list, spaceres);
+ first_block, dfops, spaceres);
if (error)
goto out_trans_abort;
@@ -2754,7 +2732,7 @@ xfs_cross_rename(
if (S_ISDIR(VFS_I(ip2)->i_mode)) {
error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
dp1->i_ino, first_block,
- free_list, spaceres);
+ dfops, spaceres);
if (error)
goto out_trans_abort;
@@ -2781,7 +2759,7 @@ xfs_cross_rename(
if (S_ISDIR(VFS_I(ip1)->i_mode)) {
error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
dp2->i_ino, first_block,
- free_list, spaceres);
+ dfops, spaceres);
if (error)
goto out_trans_abort;
@@ -2820,10 +2798,10 @@ xfs_cross_rename(
}
xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
- return xfs_finish_rename(tp, free_list);
+ return xfs_finish_rename(tp, dfops);
out_trans_abort:
- xfs_bmap_cancel(free_list);
+ xfs_defer_cancel(dfops);
xfs_trans_cancel(tp);
return error;
}
@@ -2855,6 +2833,7 @@ xfs_rename_alloc_whiteout(
* and flag it as linkable.
*/
drop_nlink(VFS_I(tmpfile));
+ xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
@@ -2877,7 +2856,7 @@ xfs_rename(
{
struct xfs_mount *mp = src_dp->i_mount;
struct xfs_trans *tp;
- struct xfs_bmap_free free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
@@ -2910,15 +2889,15 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
- tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
spaceres = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+ &tp);
}
if (error)
- goto out_trans_cancel;
+ goto out_release_wip;
/*
* Attach the dquots to the inodes
@@ -2966,13 +2945,13 @@ xfs_rename(
goto out_trans_cancel;
}
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE)
return xfs_cross_rename(tp, src_dp, src_name, src_ip,
target_dp, target_name, target_ip,
- &free_list, &first_block, spaceres);
+ &dfops, &first_block, spaceres);
/*
* Set up the target.
@@ -2994,7 +2973,7 @@ xfs_rename(
*/
error = xfs_dir_createname(tp, target_dp, target_name,
src_ip->i_ino, &first_block,
- &free_list, spaceres);
+ &dfops, spaceres);
if (error)
goto out_bmap_cancel;
@@ -3034,7 +3013,7 @@ xfs_rename(
*/
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino,
- &first_block, &free_list, spaceres);
+ &first_block, &dfops, spaceres);
if (error)
goto out_bmap_cancel;
@@ -3069,7 +3048,7 @@ xfs_rename(
*/
error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
target_dp->i_ino,
- &first_block, &free_list, spaceres);
+ &first_block, &dfops, spaceres);
ASSERT(error != -EEXIST);
if (error)
goto out_bmap_cancel;
@@ -3108,10 +3087,10 @@ xfs_rename(
*/
if (wip) {
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
- &first_block, &free_list, spaceres);
+ &first_block, &dfops, spaceres);
} else
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
- &first_block, &free_list, spaceres);
+ &first_block, &dfops, spaceres);
if (error)
goto out_bmap_cancel;
@@ -3146,15 +3125,16 @@ xfs_rename(
if (new_parent)
xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
- error = xfs_finish_rename(tp, &free_list);
+ error = xfs_finish_rename(tp, &dfops);
if (wip)
IRELE(wip);
return error;
out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
out_trans_cancel:
xfs_trans_cancel(tp);
+out_release_wip:
if (wip)
IRELE(wip);
return error;
@@ -3162,16 +3142,16 @@ out_trans_cancel:
STATIC int
xfs_iflush_cluster(
- xfs_inode_t *ip,
- xfs_buf_t *bp)
+ struct xfs_inode *ip,
+ struct xfs_buf *bp)
{
- xfs_mount_t *mp = ip->i_mount;
+ struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
unsigned long first_index, mask;
unsigned long inodes_per_cluster;
- int ilist_size;
- xfs_inode_t **ilist;
- xfs_inode_t *iq;
+ int cilist_size;
+ struct xfs_inode **cilist;
+ struct xfs_inode *cip;
int nr_found;
int clcount = 0;
int bufwasdelwri;
@@ -3180,23 +3160,23 @@ xfs_iflush_cluster(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
- ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
- ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
- if (!ilist)
+ cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
+ if (!cilist)
goto out_put;
mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
rcu_read_lock();
/* really need a gang lookup range call here */
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
first_index, inodes_per_cluster);
if (nr_found == 0)
goto out_free;
for (i = 0; i < nr_found; i++) {
- iq = ilist[i];
- if (iq == ip)
+ cip = cilist[i];
+ if (cip == ip)
continue;
/*
@@ -3205,20 +3185,30 @@ xfs_iflush_cluster(
* We need to check under the i_flags_lock for a valid inode
* here. Skip it if it is not valid or the wrong inode.
*/
- spin_lock(&ip->i_flags_lock);
- if (!ip->i_ino ||
- (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
- spin_unlock(&ip->i_flags_lock);
+ spin_lock(&cip->i_flags_lock);
+ if (!cip->i_ino ||
+ __xfs_iflags_test(cip, XFS_ISTALE)) {
+ spin_unlock(&cip->i_flags_lock);
continue;
}
- spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Once we fall off the end of the cluster, no point checking
+ * any more inodes in the list because they will also all be
+ * outside the cluster.
+ */
+ if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
+ spin_unlock(&cip->i_flags_lock);
+ break;
+ }
+ spin_unlock(&cip->i_flags_lock);
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
* later after the appropriate locks are acquired.
*/
- if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+ if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
continue;
/*
@@ -3226,15 +3216,28 @@ xfs_iflush_cluster(
* then this inode cannot be flushed and is skipped.
*/
- if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+ if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
continue;
- if (!xfs_iflock_nowait(iq)) {
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ if (!xfs_iflock_nowait(cip)) {
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
continue;
}
- if (xfs_ipincount(iq)) {
- xfs_ifunlock(iq);
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(cip)) {
+ xfs_ifunlock(cip);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+
+ /*
+ * Check the inode number again, just to be certain we are not
+ * racing with freeing in xfs_reclaim_inode(). See the comments
+ * in that function for more information as to why the initial
+ * check is not sufficient.
+ */
+ if (!cip->i_ino) {
+ xfs_ifunlock(cip);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
continue;
}
@@ -3242,18 +3245,18 @@ xfs_iflush_cluster(
* arriving here means that this inode can be flushed. First
* re-check that it's dirty before flushing.
*/
- if (!xfs_inode_clean(iq)) {
+ if (!xfs_inode_clean(cip)) {
int error;
- error = xfs_iflush_int(iq, bp);
+ error = xfs_iflush_int(cip, bp);
if (error) {
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
goto cluster_corrupt_out;
}
clcount++;
} else {
- xfs_ifunlock(iq);
+ xfs_ifunlock(cip);
}
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ xfs_iunlock(cip, XFS_ILOCK_SHARED);
}
if (clcount) {
@@ -3263,7 +3266,7 @@ xfs_iflush_cluster(
out_free:
rcu_read_unlock();
- kmem_free(ilist);
+ kmem_free(cilist);
out_put:
xfs_perag_put(pag);
return 0;
@@ -3306,8 +3309,8 @@ cluster_corrupt_out:
/*
* Unlocks the flush lock
*/
- xfs_iflush_abort(iq, false);
- kmem_free(ilist);
+ xfs_iflush_abort(cip, false);
+ kmem_free(cilist);
xfs_perag_put(pag);
return -EFSCORRUPTED;
}
@@ -3327,7 +3330,7 @@ xfs_iflush(
struct xfs_buf **bpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_buf *bp;
+ struct xfs_buf *bp = NULL;
struct xfs_dinode *dip;
int error;
@@ -3369,14 +3372,22 @@ xfs_iflush(
}
/*
- * Get the buffer containing the on-disk inode.
+ * Get the buffer containing the on-disk inode. We are doing a try-lock
+ * operation here, so we may get an EAGAIN error. In that case, we
+ * simply want to return with the inode still dirty.
+ *
+ * If we get any other error, we effectively have a corruption situation
+ * and we cannot flush the inode, so we treat it the same as failing
+ * xfs_iflush_int().
*/
error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
0);
- if (error || !bp) {
+ if (error == -EAGAIN) {
xfs_ifunlock(ip);
return error;
}
+ if (error)
+ goto corrupt_out;
/*
* First flush out the inode that xfs_iflush was called with.
@@ -3404,7 +3415,8 @@ xfs_iflush(
return 0;
corrupt_out:
- xfs_buf_relse(bp);
+ if (bp)
+ xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
error = -EFSCORRUPTED;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 43e1d51b15eb8..e1a411e08f00f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -27,7 +27,7 @@
struct xfs_dinode;
struct xfs_inode;
struct xfs_buf;
-struct xfs_bmap_free;
+struct xfs_defer_ops;
struct xfs_bmbt_irec;
struct xfs_inode_log_item;
struct xfs_mount;
@@ -395,14 +395,10 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
-int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
- xfs_nlink_t, xfs_dev_t, prid_t, int,
- struct xfs_buf **, xfs_inode_t **);
uint xfs_ip2xflags(struct xfs_inode *);
-uint xfs_dic2xflags(struct xfs_dinode *);
int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
- struct xfs_bmap_free *);
+ struct xfs_defer_ops *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int, xfs_fsize_t);
void xfs_iext_realloc(xfs_inode_t *, int, int);
@@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
-void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
@@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
struct xfs_inode **, int *);
-int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
-int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
/* from xfs_file.c */
enum xfs_prealloc_flags {
@@ -434,12 +427,16 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
xfs_fsize_t isize, bool *did_zeroing);
-int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
+ bool *did_zero);
loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
loff_t eof, int whence);
/* from xfs_iops.c */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+
/*
* When setting up a newly allocated inode, we need to call
* xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -447,7 +444,6 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
* before we've completed instantiation. Otherwise we can do it
* the moment the inode lookup is complete.
*/
-extern void xfs_setup_inode(struct xfs_inode *ip);
static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
{
xfs_iflags_clear(ip, XFS_INEW);
@@ -458,6 +454,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
{
xfs_setup_inode(ip);
+ xfs_setup_iops(ip);
xfs_finish_inode_setup(ip);
}
@@ -476,14 +473,4 @@ do { \
extern struct kmem_zone *xfs_inode_zone;
-/*
- * Flags for read/write calls
- */
-#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
-#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
- { XFS_IO_ISDIRECT, "DIRECT" }, \
- { XFS_IO_INVIS, "INVIS"}
-
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c48b5b18d771f..892c2aced2078 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork(
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_real_bytes == 0 ||
- ip->i_df.if_real_bytes == data_bytes);
+ ip->i_df.if_real_bytes >= data_bytes);
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_d.di_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
@@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork(
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
ASSERT(ip->i_afp->if_real_bytes == 0 ||
- ip->i_afp->if_real_bytes == data_bytes);
+ ip->i_afp->if_real_bytes >= data_bytes);
ASSERT(ip->i_afp->if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
ip->i_afp->if_u1.if_data,
@@ -479,6 +479,8 @@ STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
struct list_head *buffer_list)
+ __releases(&lip->li_ailp->xa_lock)
+ __acquires(&lip->li_ailp->xa_lock)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
@@ -649,6 +651,7 @@ void
xfs_inode_item_destroy(
xfs_inode_t *ip)
{
+ kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
kmem_zone_free(xfs_ili_zone, ip->i_itemp);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcb6c19ce3ea4..96a70fd1f5d67 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -232,7 +232,7 @@ xfs_open_by_handle(
}
if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- error = -EACCES;
+ error = -EPERM;
goto out_dput;
}
@@ -277,7 +277,6 @@ xfs_readlink_by_handle(
{
struct dentry *dentry;
__u32 olen;
- void *link;
int error;
if (!capable(CAP_SYS_ADMIN))
@@ -288,7 +287,7 @@ xfs_readlink_by_handle(
return PTR_ERR(dentry);
/* Restrict this handle operation to symlinks only. */
- if (!d_is_symlink(dentry)) {
+ if (!d_inode(dentry)->i_op->readlink) {
error = -EINVAL;
goto out_dput;
}
@@ -298,21 +297,8 @@ xfs_readlink_by_handle(
goto out_dput;
}
- link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
- if (!link) {
- error = -ENOMEM;
- goto out_dput;
- }
-
- error = xfs_readlink(XFS_I(d_inode(dentry)), link);
- if (error)
- goto out_kfree;
- error = readlink_copy(hreq->ohandle, olen, link);
- if (error)
- goto out_kfree;
+ error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen);
- out_kfree:
- kfree(link);
out_dput:
dput(dentry);
return error;
@@ -334,12 +320,10 @@ xfs_set_dmattrs(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
return error;
- }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -403,6 +387,7 @@ xfs_attrlist_by_handle(
{
int error = -ENOMEM;
attrlist_cursor_kern_t *cursor;
+ struct xfs_fsop_attrlist_handlereq __user *p = arg;
xfs_fsop_attrlist_handlereq_t al_hreq;
struct dentry *dentry;
char *kbuf;
@@ -435,6 +420,11 @@ xfs_attrlist_by_handle(
if (error)
goto out_kfree;
+ if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+ error = -EFAULT;
+ goto out_kfree;
+ }
+
if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
error = -EFAULT;
@@ -611,13 +601,12 @@ xfs_attrmulti_by_handle(
int
xfs_ioc_space(
- struct xfs_inode *ip,
- struct inode *inode,
struct file *filp,
- int ioflags,
unsigned int cmd,
xfs_flock64_t *bf)
{
+ struct inode *inode = file_inode(filp);
+ struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr;
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
@@ -642,7 +631,7 @@ xfs_ioc_space(
if (filp->f_flags & O_DSYNC)
flags |= XFS_PREALLOC_SYNC;
- if (ioflags & XFS_IO_INVIS)
+ if (filp->f_mode & FMODE_NOCMTIME)
flags |= XFS_PREALLOC_INVISIBLE;
error = mnt_want_write_file(filp);
@@ -1141,10 +1130,9 @@ xfs_ioctl_setattr_get_trans(
if (XFS_FORCED_SHUTDOWN(mp))
goto out_unlock;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
if (error)
- goto out_cancel;
+ return ERR_PTR(error);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
@@ -1481,8 +1469,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
STATIC int
xfs_ioc_getbmap(
- struct xfs_inode *ip,
- int ioflags,
+ struct file *file,
unsigned int cmd,
void __user *arg)
{
@@ -1496,10 +1483,10 @@ xfs_ioc_getbmap(
return -EINVAL;
bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
- if (ioflags & XFS_IO_INVIS)
+ if (file->f_mode & FMODE_NOCMTIME)
bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
- error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+ error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
(__force struct getbmap *)arg+1);
if (error)
return error;
@@ -1592,6 +1579,17 @@ xfs_ioc_swapext(
goto out_put_tmp_file;
}
+ /*
+ * We need to ensure that the fds passed in point to XFS inodes
+ * before we cast and access them as XFS structures as we have no
+ * control over what the user passes us here.
+ */
+ if (f.file->f_op != &xfs_file_operations ||
+ tmp.file->f_op != &xfs_file_operations) {
+ error = -EINVAL;
+ goto out_put_tmp_file;
+ }
+
ip = XFS_I(file_inode(f.file));
tip = XFS_I(file_inode(tmp.file));
@@ -1636,12 +1634,8 @@ xfs_file_ioctl(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
- int ioflags = 0;
int error;
- if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
trace_xfs_file_ioctl(ip);
switch (cmd) {
@@ -1660,7 +1654,7 @@ xfs_file_ioctl(
if (copy_from_user(&bf, arg, sizeof(bf)))
return -EFAULT;
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ return xfs_ioc_space(filp, cmd, &bf);
}
case XFS_IOC_DIOINFO: {
struct dioattr da;
@@ -1719,7 +1713,7 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
- return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+ return xfs_ioc_getbmap(filp, cmd, arg);
case XFS_IOC_GETBMAPX:
return xfs_ioc_getbmapx(ip, arg);
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 77c02c7900b6e..8b52881bfd901 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -20,10 +20,7 @@
extern int
xfs_ioc_space(
- struct xfs_inode *ip,
- struct inode *inode,
struct file *filp,
- int ioflags,
unsigned int cmd,
xfs_flock64_t *bf);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1a05d8ae327db..321f57721b922 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
- int ioflags = 0;
int error;
- if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
trace_xfs_file_compat_ioctl(ip);
switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
if (xfs_compat_flock64_copyin(&bf, arg))
return -EFAULT;
cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ return xfs_ioc_space(filp, cmd, &bf);
}
case XFS_IOC_FSGEOMETRY_V1_32:
return xfs_compat_ioc_fsgeometry_v1(mp, arg);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d81bdc080370e..2114d53df4331 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
@@ -22,6 +23,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
@@ -127,11 +129,12 @@ xfs_iomap_write_direct(
int quota_flag;
int rt;
xfs_trans_t *tp;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
uint qblocks, resblks, resrtextents;
int error;
int lockmode;
int bmapi_flags = XFS_BMAPI_PREALLOC;
+ uint tflags = 0;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
@@ -192,11 +195,6 @@ xfs_iomap_write_direct(
return error;
/*
- * Allocate and setup the transaction
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-
- /*
* For DAX, we do not allocate unwritten extents, but instead we zero
* the block before we commit the transaction. Ideally we'd like to do
* this outside the transaction context, but if we commit and then crash
@@ -209,23 +207,17 @@ xfs_iomap_write_direct(
* the reserve block pool for bmbt block allocation if there is no space
* left but we need to do unwritten extent conversion.
*/
-
if (IS_DAX(VFS_I(ip))) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
if (ISUNWRITTEN(imap)) {
- tp->t_flags |= XFS_TRANS_RESERVE;
+ tflags |= XFS_TRANS_RESERVE;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
}
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- resblks, resrtextents);
- /*
- * Check for running out of space, note: need lock to return
- */
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
+ tflags, &tp);
+ if (error)
return error;
- }
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockmode);
@@ -240,18 +232,18 @@ xfs_iomap_write_direct(
* From this point onwards we overwrite the imap pointer that the
* caller gave to us.
*/
- xfs_bmap_init(&free_list, &firstfsb);
+ xfs_defer_init(&dfops, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
bmapi_flags, &firstfsb, resblks, imap,
- &nimaps, &free_list);
+ &nimaps, &dfops);
if (error)
goto out_bmap_cancel;
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_bmap_cancel;
@@ -275,7 +267,7 @@ out_unlock:
return error;
out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
xfs_trans_cancel(tp);
@@ -694,7 +686,7 @@ xfs_iomap_write_allocate(
xfs_fileoff_t offset_fsb, last_block;
xfs_fileoff_t end_fsb, map_start_fsb;
xfs_fsblock_t first_block;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
xfs_filblks_t count_fsb;
xfs_trans_t *tp;
int nimaps;
@@ -726,19 +718,17 @@ xfs_iomap_write_allocate(
nimaps = 0;
while (nimaps == 0) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
- tp->t_flags |= XFS_TRANS_RESERVE;
nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- nres, 0);
- if (error) {
- xfs_trans_cancel(tp);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+ 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
return error;
- }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
/*
* it is possible that the extents have changed since
@@ -794,11 +784,11 @@ xfs_iomap_write_allocate(
error = xfs_bmapi_write(tp, ip, map_start_fsb,
count_fsb, 0, &first_block,
nres, imap, &nimaps,
- &free_list);
+ &dfops);
if (error)
goto trans_cancel;
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto trans_cancel;
@@ -832,7 +822,7 @@ xfs_iomap_write_allocate(
}
trans_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
xfs_trans_cancel(tp);
error0:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -853,7 +843,7 @@ xfs_iomap_write_unwritten(
int nimaps;
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
xfs_fsize_t i_size;
uint resblks;
int error;
@@ -878,25 +868,18 @@ xfs_iomap_write_unwritten(
do {
/*
- * set up a transaction to convert the range of extents
+ * Set up a transaction to convert the range of extents
* from unwritten to real. Do allocations in a loop until
* we have covered the range passed in.
*
- * Note that we open code the transaction allocation here
- * to pass KM_NOFS--we can't risk to recursing back into
- * the filesystem here as we might be asked to write out
- * the same inode that we complete here and might deadlock
- * on the iolock.
+ * Note that we can't risk recursing back into the filesystem
+ * here as we might be asked to write out the same inode that we
+ * complete here and might deadlock on the iolock.
*/
- sb_start_intwrite(mp->m_super);
- tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
- tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
- resblks, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -904,11 +887,11 @@ xfs_iomap_write_unwritten(
/*
* Modify the unwritten extent state of the buffer.
*/
- xfs_bmap_init(&free_list, &firstfsb);
+ xfs_defer_init(&dfops, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
XFS_BMAPI_CONVERT, &firstfsb, resblks,
- &imap, &nimaps, &free_list);
+ &imap, &nimaps, &dfops);
if (error)
goto error_on_bmapi_transaction;
@@ -927,7 +910,7 @@ xfs_iomap_write_unwritten(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto error_on_bmapi_transaction;
@@ -954,8 +937,178 @@ xfs_iomap_write_unwritten(
return 0;
error_on_bmapi_transaction:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
+
+void
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+ return !nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int nimaps = 1, error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+ length = mp->m_super->s_maxbytes - offset;
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ENTIRE);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+ }
+
+ if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done here somewhat symmetric
+ * with the work writeback does. This is a completely arbitrary
+ * number pulled out of thin air as a best guess for initial
+ * testing.
+ *
+ * Note that the value needs to be less than 32 bits wide until
+ * the lower level functions are updated.
+ */
+ length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+ if (xfs_get_extsz_hint(ip)) {
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap,
+ nimaps);
+ } else {
+ error = xfs_iomap_write_delay(ip, offset, length, &imap);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ } else if (nimaps) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ } else {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = offset;
+ iomap->length = length;
+ }
+
+ return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+ struct xfs_inode *ip,
+ loff_t offset,
+ loff_t length,
+ ssize_t written)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ start_fsb = XFS_B_TO_FSB(mp, offset + written);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ /*
+ * Trim back delalloc blocks if we didn't manage to write the whole
+ * range reserved.
+ *
+ * We don't need to care about racing delalloc as we hold i_mutex
+ * across the reserve/allocate/unreserve calls. If there are delalloc
+ * blocks in the range, they are ours.
+ */
+ if (start_fsb < end_fsb) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino %lld",
+ __func__, ip->i_ino);
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+static int
+xfs_file_iomap_end(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+ return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+ length, written);
+ return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+ .iomap_begin = xfs_file_iomap_begin,
+ .iomap_end = xfs_file_iomap_end,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e663d7440..e066d045e2ffe 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,8 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
+#include <linux/iomap.h>
+
struct xfs_inode;
struct xfs_bmbt_irec;
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
struct xfs_bmbt_irec *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *);
+
+extern struct iomap_ops xfs_iomap_ops;
+
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fb7dc61f4a29d..ab820f84ed507 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,12 +38,13 @@
#include "xfs_dir2.h"
#include "xfs_trans_space.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/capability.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
#include <linux/slab.h>
/*
@@ -181,6 +182,8 @@ xfs_generic_create(
}
#endif
+ xfs_setup_iops(ip);
+
if (tmpfile)
d_tmpfile(dentry, inode);
else
@@ -368,6 +371,8 @@ xfs_vn_symlink(
if (unlikely(error))
goto out_cleanup_inode;
+ xfs_setup_iops(cip);
+
d_instantiate(dentry, inode);
xfs_finish_inode_setup(cip);
return 0;
@@ -442,6 +447,16 @@ xfs_vn_get_link(
return ERR_PTR(error);
}
+STATIC const char *
+xfs_vn_get_link_inline(
+ struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+ return XFS_I(inode)->i_df.if_u1.if_data;
+}
+
STATIC int
xfs_vn_getattr(
struct vfsmount *mnt,
@@ -599,12 +614,12 @@ xfs_setattr_nonsize(
return error;
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ goto out_dqrele;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
/*
* Change file ownership. Must be the owner or privileged.
@@ -633,12 +648,10 @@ xfs_setattr_nonsize(
NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
- goto out_unlock;
+ goto out_cancel;
}
}
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* Change file ownership. Must be the owner or privileged.
*/
@@ -722,10 +735,9 @@ xfs_setattr_nonsize(
return 0;
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_trans_cancel:
+out_cancel:
xfs_trans_cancel(tp);
+out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
return error;
@@ -790,20 +802,30 @@ xfs_setattr_size(
return error;
/*
+ * Wait for all direct I/O to complete.
+ */
+ inode_dio_wait(inode);
+
+ /*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
* the transaction because the inode cannot be unlocked once it is a
* part of the transaction.
*
- * Start with zeroing any data block beyond EOF that we may expose on
- * file extension.
+ * Start with zeroing any data beyond EOF that we may expose on file
+ * extension, or zeroing out the rest of the block on a downward
+ * truncate.
*/
if (newsize > oldsize) {
error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
- if (error)
- return error;
+ } else {
+ error = iomap_truncate_page(inode, newsize, &did_zeroing,
+ &xfs_iomap_ops);
}
+ if (error)
+ return error;
+
/*
* We are going to log the inode size change in this transaction so
* any previous writes that are beyond the on disk EOF and the new
@@ -812,17 +834,14 @@ xfs_setattr_size(
* problem. Note that this includes any block zeroing we did above;
* otherwise those blocks may not be zeroed after a crash.
*/
- if (newsize > ip->i_d.di_size &&
- (oldsize != ip->i_d.di_size || did_zeroing)) {
+ if (did_zeroing ||
+ (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
return error;
}
- /* Now wait for all direct I/O to complete. */
- inode_dio_wait(inode);
-
/*
* We've already locked out new page faults, so now we can safely remove
* pages from the page cache knowing they won't get refaulted until we
@@ -834,25 +853,17 @@ xfs_setattr_size(
* We have to do all the page cache truncate work outside the
* transaction context as the "lock" order is page lock->log space
* reservation as defined by extent allocation in the writeback path.
- * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+ * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
* having already truncated the in-memory version of the file (i.e. made
* user visible changes). There's not much we can do about this, except
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
*/
- if (IS_DAX(inode))
- error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
- else
- error = block_truncate_page(inode->i_mapping, newsize,
- xfs_get_blocks);
- if (error)
- return error;
truncate_setsize(inode, newsize);
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ return error;
lock_flags |= XFS_ILOCK_EXCL;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -971,12 +982,9 @@ xfs_vn_update_time(
trace_xfs_update_time(ip);
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (flags & S_CTIME)
@@ -991,51 +999,6 @@ xfs_vn_update_time(
return xfs_trans_commit(tp);
}
-#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
- void **arg,
- struct getbmapx *bmv,
- int *full)
-{
- int error;
- struct fiemap_extent_info *fieinfo = *arg;
- u32 fiemap_flags = 0;
- u64 logical, physical, length;
-
- /* Do nothing for a hole */
- if (bmv->bmv_block == -1LL)
- return 0;
-
- logical = BBTOB(bmv->bmv_offset);
- physical = BBTOB(bmv->bmv_block);
- length = BBTOB(bmv->bmv_length);
-
- if (bmv->bmv_oflags & BMV_OF_PREALLOC)
- fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
- else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
- fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- physical = 0; /* no block yet */
- }
- if (bmv->bmv_oflags & BMV_OF_LAST)
- fiemap_flags |= FIEMAP_EXTENT_LAST;
-
- error = fiemap_fill_next_extent(fieinfo, logical, physical,
- length, fiemap_flags);
- if (error > 0) {
- error = 0;
- *full = 1; /* user array now full */
- }
-
- return error;
-}
-
STATIC int
xfs_vn_fiemap(
struct inode *inode,
@@ -1043,38 +1006,13 @@ xfs_vn_fiemap(
u64 start,
u64 length)
{
- xfs_inode_t *ip = XFS_I(inode);
- struct getbmapx bm;
int error;
- error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
- if (error)
- return error;
-
- /* Set up bmap header for xfs internal routine */
- bm.bmv_offset = BTOBBT(start);
- /* Special case for whole file */
- if (length == FIEMAP_MAX_OFFSET)
- bm.bmv_length = -1LL;
- else
- bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
-
- /* We add one because in getbmap world count includes the header */
- bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
- fieinfo->fi_extents_max + 1;
- bm.bmv_count = min_t(__s32, bm.bmv_count,
- (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
- bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
- if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
- bm.bmv_iflags |= BMV_IF_ATTRFORK;
- if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
- bm.bmv_iflags |= BMV_IF_DELALLOC;
-
- error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
- if (error)
- return error;
+ xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+ error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+ xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
- return 0;
+ return error;
}
STATIC int
@@ -1167,6 +1105,18 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.update_time = xfs_vn_update_time,
};
+static const struct inode_operations xfs_inline_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .get_link = xfs_vn_get_link_inline,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
+};
+
STATIC void
xfs_diflags_to_iflags(
struct inode *inode,
@@ -1193,7 +1143,7 @@ xfs_diflags_to_iflags(
}
/*
- * Initialize the Linux inode and set up the operation vectors.
+ * Initialize the Linux inode.
*
* When reading existing inodes from disk this is called directly from xfs_iget,
* when creating a new inode it is called from xfs_ialloc after setting up the
@@ -1232,32 +1182,12 @@ xfs_setup_inode(
i_size_write(inode, ip->i_d.di_size);
xfs_diflags_to_iflags(inode, ip);
- ip->d_ops = ip->i_mount->m_nondir_inode_ops;
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- inode->i_op = &xfs_inode_operations;
- inode->i_fop = &xfs_file_operations;
- inode->i_mapping->a_ops = &xfs_address_space_operations;
- break;
- case S_IFDIR:
+ if (S_ISDIR(inode->i_mode)) {
lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
- if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
- inode->i_op = &xfs_dir_ci_inode_operations;
- else
- inode->i_op = &xfs_dir_inode_operations;
- inode->i_fop = &xfs_dir_file_operations;
ip->d_ops = ip->i_mount->m_dir_inode_ops;
- break;
- case S_IFLNK:
- inode->i_op = &xfs_symlink_inode_operations;
- if (!(ip->i_df.if_flags & XFS_IFINLINE))
- inode->i_mapping->a_ops = &xfs_address_space_operations;
- break;
- default:
- inode->i_op = &xfs_inode_operations;
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- break;
+ } else {
+ ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
}
/*
@@ -1277,3 +1207,35 @@ xfs_setup_inode(
cache_no_acl(inode);
}
}
+
+void
+xfs_setup_iops(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = &ip->i_vnode;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &xfs_inode_operations;
+ inode->i_fop = &xfs_file_operations;
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
+ break;
+ case S_IFDIR:
+ if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+ inode->i_op = &xfs_dir_ci_inode_operations;
+ else
+ inode->i_op = &xfs_dir_inode_operations;
+ inode->i_fop = &xfs_dir_file_operations;
+ break;
+ case S_IFLNK:
+ if (ip->i_df.if_flags & XFS_IFINLINE)
+ inode->i_op = &xfs_inline_symlink_inode_operations;
+ else
+ inode->i_op = &xfs_symlink_inode_operations;
+ break;
+ default:
+ inode->i_op = &xfs_inode_operations;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ }
+}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ec0e239a0fa90..b8d64d520e125 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -135,7 +135,7 @@ typedef __u32 xfs_nlink_t;
* Size of block device i/o is parameterized here.
* Currently the system supports page-sized i/o.
*/
-#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT
+#define BLKDEV_IOSHIFT PAGE_SHIFT
#define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
/* number of BB's per block device block */
#define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
@@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
return x;
}
-/* ARM old ABI has some weird alignment/padding */
-#if defined(__arm__) && !defined(__ARM_EABI__)
-#define __arch_pack __attribute__((packed))
-#else
-#define __arch_pack
-#endif
-
#define ASSERT_ALWAYS(expr) \
(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b49ccf5c1d756..3b74fa011bb15 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -435,8 +435,7 @@ xfs_log_reserve(
int cnt,
struct xlog_ticket **ticp,
__uint8_t client,
- bool permanent,
- uint t_type)
+ bool permanent)
{
struct xlog *log = mp->m_log;
struct xlog_ticket *tic;
@@ -456,7 +455,6 @@ xfs_log_reserve(
if (!tic)
return -ENOMEM;
- tic->t_trans_type = t_type;
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -790,7 +788,7 @@ xfs_log_mount_cancel(
* As far as I know, there weren't any dependencies on the old behaviour.
*/
-int
+static int
xfs_log_unmount_write(xfs_mount_t *mp)
{
struct xlog *log = mp->m_log;
@@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
} while (iclog != first_iclog);
#endif
if (! (XLOG_FORCED_SHUTDOWN(log))) {
- error = xfs_log_reserve(mp, 600, 1, &tic,
- XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
+ error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
if (!error) {
/* the data section must be 32 bit size aligned */
struct {
@@ -1039,7 +1036,7 @@ xfs_log_space_wake(
* there's no point in running a dummy transaction at this point because we
* can't start trying to idle the log until both the CIL and AIL are empty.
*/
-int
+static int
xfs_log_need_covered(xfs_mount_t *mp)
{
struct xlog *log = mp->m_log;
@@ -1180,7 +1177,7 @@ xlog_space_left(
* The log manager needs its own routine, in order to control what
* happens with the buffer after the write completes.
*/
-void
+static void
xlog_iodone(xfs_buf_t *bp)
{
struct xlog_in_core *iclog = bp->b_fspriv;
@@ -1305,7 +1302,7 @@ xfs_log_work_queue(
* disk. If there is nothing dirty, then we might need to cover the log to
* indicate that the filesystem is idle.
*/
-void
+static void
xfs_log_worker(
struct work_struct *work)
{
@@ -1418,7 +1415,7 @@ xlog_alloc_log(
*/
error = -ENOMEM;
bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
- BTOBB(log->l_iclog_size), 0);
+ BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
if (!bp)
goto out_free_log;
@@ -1457,7 +1454,8 @@ xlog_alloc_log(
prev_iclog = iclog;
bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- BTOBB(log->l_iclog_size), 0);
+ BTOBB(log->l_iclog_size),
+ XBF_NO_IOACCT);
if (!bp)
goto out_free_iclog;
@@ -2032,58 +2030,8 @@ xlog_print_tic_res(
REG_TYPE_STR(ICREATE, "inode create")
};
#undef REG_TYPE_STR
-#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type
- static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
- TRANS_TYPE_STR(SETATTR_NOT_SIZE),
- TRANS_TYPE_STR(SETATTR_SIZE),
- TRANS_TYPE_STR(INACTIVE),
- TRANS_TYPE_STR(CREATE),
- TRANS_TYPE_STR(CREATE_TRUNC),
- TRANS_TYPE_STR(TRUNCATE_FILE),
- TRANS_TYPE_STR(REMOVE),
- TRANS_TYPE_STR(LINK),
- TRANS_TYPE_STR(RENAME),
- TRANS_TYPE_STR(MKDIR),
- TRANS_TYPE_STR(RMDIR),
- TRANS_TYPE_STR(SYMLINK),
- TRANS_TYPE_STR(SET_DMATTRS),
- TRANS_TYPE_STR(GROWFS),
- TRANS_TYPE_STR(STRAT_WRITE),
- TRANS_TYPE_STR(DIOSTRAT),
- TRANS_TYPE_STR(WRITEID),
- TRANS_TYPE_STR(ADDAFORK),
- TRANS_TYPE_STR(ATTRINVAL),
- TRANS_TYPE_STR(ATRUNCATE),
- TRANS_TYPE_STR(ATTR_SET),
- TRANS_TYPE_STR(ATTR_RM),
- TRANS_TYPE_STR(ATTR_FLAG),
- TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
- TRANS_TYPE_STR(SB_CHANGE),
- TRANS_TYPE_STR(DUMMY1),
- TRANS_TYPE_STR(DUMMY2),
- TRANS_TYPE_STR(QM_QUOTAOFF),
- TRANS_TYPE_STR(QM_DQALLOC),
- TRANS_TYPE_STR(QM_SETQLIM),
- TRANS_TYPE_STR(QM_DQCLUSTER),
- TRANS_TYPE_STR(QM_QINOCREATE),
- TRANS_TYPE_STR(QM_QUOTAOFF_END),
- TRANS_TYPE_STR(FSYNC_TS),
- TRANS_TYPE_STR(GROWFSRT_ALLOC),
- TRANS_TYPE_STR(GROWFSRT_ZERO),
- TRANS_TYPE_STR(GROWFSRT_FREE),
- TRANS_TYPE_STR(SWAPEXT),
- TRANS_TYPE_STR(CHECKPOINT),
- TRANS_TYPE_STR(ICREATE),
- TRANS_TYPE_STR(CREATE_TMPFILE)
- };
-#undef TRANS_TYPE_STR
xfs_warn(mp, "xlog_write: reservation summary:");
- xfs_warn(mp, " trans type = %s (%u)",
- ((ticket->t_trans_type <= 0 ||
- ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
- "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
- ticket->t_trans_type);
xfs_warn(mp, " unit res = %d bytes",
ticket->t_unit_res);
xfs_warn(mp, " current res = %d bytes",
@@ -3378,7 +3326,7 @@ xfs_log_force(
{
int error;
- trace_xfs_log_force(mp, 0);
+ trace_xfs_log_force(mp, 0, _RET_IP_);
error = _xfs_log_force(mp, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3527,7 +3475,7 @@ xfs_log_force_lsn(
{
int error;
- trace_xfs_log_force(mp, lsn);
+ trace_xfs_log_force(mp, lsn, _RET_IP_);
error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3709,7 +3657,6 @@ xlog_ticket_alloc(
tic->t_tid = prandom_u32();
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
- tic->t_trans_type = 0;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index aa533a7d50f21..b5e71072fde59 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -161,15 +161,10 @@ int xfs_log_reserve(struct xfs_mount *mp,
int count,
struct xlog_ticket **ticket,
__uint8_t clientid,
- bool permanent,
- uint t_type);
+ bool permanent);
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
-int xfs_log_need_covered(struct xfs_mount *mp);
-
-void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -179,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
-void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4e7649351f5a2..a4ab192e1792d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -51,7 +51,6 @@ xlog_cil_ticket_alloc(
tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
KM_SLEEP|KM_NOFS);
- tic->t_trans_type = XFS_TRANS_CHECKPOINT;
/*
* set the current reservation to zero so we know to steal the basic
@@ -79,6 +78,157 @@ xlog_cil_init_post_recovery(
log->l_cilp->xc_ctx->sequence = 1;
}
+static inline int
+xlog_cil_iovec_space(
+ uint niovecs)
+{
+ return round_up((sizeof(struct xfs_log_vec) +
+ niovecs * sizeof(struct xfs_log_iovec)),
+ sizeof(uint64_t));
+}
+
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ * a) it does not exist; or
+ * b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * it is needed, and this is the buffer that is reallocated to match the size of
+ * the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * its size is right, otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+ struct xlog *log,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item_desc *lidp;
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+ struct xfs_log_vec *lv;
+ int niovecs = 0;
+ int nbytes = 0;
+ int buf_size;
+ bool ordered = false;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+
+ /* get number of vecs and size of data to be stored */
+ lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+ /*
+ * Ordered items need to be tracked but we do not wish to write
+ * them. We need a logvec to track the object, but we do not
+ * need an iovec or buffer to be allocated for copying data.
+ */
+ if (niovecs == XFS_LOG_VEC_ORDERED) {
+ ordered = true;
+ niovecs = 0;
+ nbytes = 0;
+ }
+
+ /*
+ * We 64-bit align the length of each iovec so that the start
+ * of the next one is naturally aligned. We'll need to
+ * account for that slack space here. Then round nbytes up
+ * to 64-bit alignment so that the initial buffer alignment is
+ * easy to calculate and verify.
+ */
+ nbytes += niovecs * sizeof(uint64_t);
+ nbytes = round_up(nbytes, sizeof(uint64_t));
+
+ /*
+ * The data buffer needs to start 64-bit aligned, so round up
+ * that space to ensure we can align it appropriately and not
+ * overrun the buffer.
+ */
+ buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+ /*
+ * if we have no shadow buffer, or it is too small, we need to
+ * reallocate it.
+ */
+ if (!lip->li_lv_shadow ||
+ buf_size > lip->li_lv_shadow->lv_size) {
+
+ /*
+ * We free and allocate here as a realloc would copy
+ * unnecessary data. We don't use kmem_zalloc() for the
+ * same reason - we don't need to zero the data area in
+ * the buffer, only the log vector header and the iovec
+ * storage.
+ */
+ kmem_free(lip->li_lv_shadow);
+
+ lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+ memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+ lv->lv_item = lip;
+ lv->lv_size = buf_size;
+ if (ordered)
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ else
+ lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+ lip->li_lv_shadow = lv;
+ } else {
+ /* same or smaller, optimise common overwrite case */
+ lv = lip->li_lv_shadow;
+ if (ordered)
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ else
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
+ lv->lv_next = NULL;
+ }
+
+ /* Ensure the lv is set up according to ->iop_size */
+ lv->lv_niovecs = niovecs;
+
+ /* The allocated data region lies beyond the iovec region */
+ lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+ }
+
+}
+
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
* log space and vectors it will consume, and if it is a new item pin it as
@@ -101,16 +251,19 @@ xfs_cil_prepare_item(
/*
* If there is no old LV, this is the first time we've seen the item in
* this CIL context and so we need to pin it. If we are replacing the
- * old_lv, then remove the space it accounts for and free it.
+ * old_lv, then remove the space it accounts for and make it the shadow
+ * buffer for later freeing. In both cases we are now switching to the
+ * shadow buffer, so update the pointer to it appropriately.
*/
- if (!old_lv)
+ if (!old_lv) {
lv->lv_item->li_ops->iop_pin(lv->lv_item);
- else if (old_lv != lv) {
+ lv->lv_item->li_lv_shadow = NULL;
+ } else if (old_lv != lv) {
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
*diff_len -= old_lv->lv_bytes;
*diff_iovecs -= old_lv->lv_niovecs;
- kmem_free(old_lv);
+ lv->lv_item->li_lv_shadow = old_lv;
}
/* attach new log vector to log item */
@@ -134,11 +287,13 @@ xfs_cil_prepare_item(
* write it out asynchronously without needing to relock the object that was
* modified at the time it gets written into the iclog.
*
- * This function builds a vector for the changes in each log item in the
- * transaction. It then works out the length of the buffer needed for each log
- * item, allocates them and formats the vector for the item into the buffer.
- * The buffer is then attached to the log item are then inserted into the
- * Committed Item List for tracking until the next checkpoint is written out.
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simply swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by the next modification or the freeing of the log item.
*
* We don't set up region headers during this process; we simply copy the
* regions into the flat buffer. We can do this because we still have to do a
@@ -171,59 +326,29 @@ xlog_cil_insert_format_items(
list_for_each_entry(lidp, &tp->t_items, lid_trans) {
struct xfs_log_item *lip = lidp->lid_item;
struct xfs_log_vec *lv;
- struct xfs_log_vec *old_lv;
- int niovecs = 0;
- int nbytes = 0;
- int buf_size;
+ struct xfs_log_vec *old_lv = NULL;
+ struct xfs_log_vec *shadow;
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- /* get number of vecs and size of data to be stored */
- lip->li_ops->iop_size(lip, &niovecs, &nbytes);
-
- /* Skip items that do not have any vectors for writing */
- if (!niovecs)
- continue;
-
/*
- * Ordered items need to be tracked but we do not wish to write
- * them. We need a logvec to track the object, but we do not
- * need an iovec or buffer to be allocated for copying data.
+ * The formatting size information is already attached to
+ * the shadow lv on the log item.
*/
- if (niovecs == XFS_LOG_VEC_ORDERED) {
+ shadow = lip->li_lv_shadow;
+ if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
ordered = true;
- niovecs = 0;
- nbytes = 0;
- }
- /*
- * We 64-bit align the length of each iovec so that the start
- * of the next one is naturally aligned. We'll need to
- * account for that slack space here. Then round nbytes up
- * to 64-bit alignment so that the initial buffer alignment is
- * easy to calculate and verify.
- */
- nbytes += niovecs * sizeof(uint64_t);
- nbytes = round_up(nbytes, sizeof(uint64_t));
-
- /* grab the old item if it exists for reservation accounting */
- old_lv = lip->li_lv;
-
- /*
- * The data buffer needs to start 64-bit aligned, so round up
- * that space to ensure we can align it appropriately and not
- * overrun the buffer.
- */
- buf_size = nbytes +
- round_up((sizeof(struct xfs_log_vec) +
- niovecs * sizeof(struct xfs_log_iovec)),
- sizeof(uint64_t));
+ /* Skip items that do not have any vectors for writing */
+ if (!shadow->lv_niovecs && !ordered)
+ continue;
/* compare to existing item size */
- if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+ old_lv = lip->li_lv;
+ if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv;
lv->lv_next = NULL;
@@ -237,32 +362,29 @@ xlog_cil_insert_format_items(
*/
*diff_iovecs -= lv->lv_niovecs;
*diff_len -= lv->lv_bytes;
+
+ /* Ensure the lv is set up according to ->iop_size */
+ lv->lv_niovecs = shadow->lv_niovecs;
+
+ /* reset the lv buffer information for new formatting */
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
+ lv->lv_buf = (char *)lv +
+ xlog_cil_iovec_space(lv->lv_niovecs);
} else {
- /* allocate new data chunk */
- lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+ /* switch to shadow buffer! */
+ lv = shadow;
lv->lv_item = lip;
- lv->lv_size = buf_size;
if (ordered) {
/* track as an ordered logvec */
ASSERT(lip->li_lv == NULL);
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
goto insert;
}
- lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
}
- /* Ensure the lv is set up according to ->iop_size */
- lv->lv_niovecs = niovecs;
-
- /* The allocated data region lies beyond the iovec region */
- lv->lv_buf_len = 0;
- lv->lv_bytes = 0;
- lv->lv_buf = (char *)lv + buf_size - nbytes;
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
-
lip->li_ops->iop_format(lip, lv);
insert:
- ASSERT(lv->lv_buf_len <= nbytes);
xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
}
}
@@ -784,6 +906,13 @@ xfs_log_commit_cil(
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
+ /*
+ * Do all necessary memory allocation before we lock the CIL.
+ * This ensures the allocation does not deadlock with a CIL
+ * push in memory reclaim (e.g. from kswapd).
+ */
+ xlog_cil_alloc_shadow_bufs(log, tp);
+
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ed8896310c00b..765f084759b5d 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -175,7 +175,6 @@ typedef struct xlog_ticket {
char t_cnt; /* current count : 1 */
char t_clientid; /* who does this belong to; : 1 */
char t_flags; /* properties of reservation : 1 */
- uint t_trans_type; /* transaction type : 4 */
/* reservation array fields */
uint t_res_num; /* num in array : 4 */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 396565f432476..e8638fd2c0c3a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -43,6 +43,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
+#include "xfs_rmap_item.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -1911,6 +1912,8 @@ xlog_recover_reorder_trans(
case XFS_LI_QUOTAOFF:
case XFS_LI_EFD:
case XFS_LI_EFI:
+ case XFS_LI_RUI:
+ case XFS_LI_RUD:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
list_move_tail(&item->ri_list, &inode_list);
@@ -2228,6 +2231,7 @@ xlog_recover_get_buf_lsn(
case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
case XFS_ABTC_MAGIC:
+ case XFS_RMAP_CRC_MAGIC:
case XFS_IBT_CRC_MAGIC:
case XFS_IBT_MAGIC: {
struct xfs_btree_block *btb = blk;
@@ -2396,6 +2400,9 @@ xlog_recover_validate_buf_type(
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
break;
+ case XFS_RMAP_CRC_MAGIC:
+ bp->b_ops = &xfs_rmapbt_buf_ops;
+ break;
default:
xfs_warn(mp, "Bad btree block magic!");
ASSERT(0);
@@ -3415,6 +3422,99 @@ xlog_recover_efd_pass2(
}
/*
+ * This routine is called to create an in-core extent rmap update
+ * item from the rui format structure which was logged on disk.
+ * It allocates an in-core rui, copies the extents from the format
+ * structure into it, and adds the rui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_rui_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_rui_log_format *rui_formatp;
+
+ rui_formatp = item->ri_buf[0].i_addr;
+
+ ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+ error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
+ if (error) {
+ xfs_rui_item_free(ruip);
+ return error;
+ }
+ atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
+
+ spin_lock(&log->l_ailp->xa_lock);
+ /*
+ * The RUI has two references. One for the RUD and one for RUI to ensure
+ * it makes it into the AIL. Insert the RUI into the AIL directly and
+ * drop the RUI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
+ */
+ xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
+ xfs_rui_release(ruip);
+ return 0;
+}
+
+
+/*
+ * This routine is called when an RUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding RUI if it
+ * was still in the log. To do this it searches the AIL for the RUI with an id
+ * equal to that in the RUD format structure. If we find it we drop the RUD
+ * reference, which removes the RUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_rud_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_rud_log_format *rud_formatp;
+ struct xfs_rui_log_item *ruip = NULL;
+ struct xfs_log_item *lip;
+ __uint64_t rui_id;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ rud_formatp = item->ri_buf[0].i_addr;
+ ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
+ rui_id = rud_formatp->rud_rui_id;
+
+ /*
+ * Search for the RUI with the id in the RUD format structure in the
+ * AIL.
+ */
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (lip->li_type == XFS_LI_RUI) {
+ ruip = (struct xfs_rui_log_item *)lip;
+ if (ruip->rui_format.rui_id == rui_id) {
+ /*
+ * Drop the RUD reference to the RUI. This
+ * removes the RUI from the AIL and frees it.
+ */
+ spin_unlock(&ailp->xa_lock);
+ xfs_rui_release(ruip);
+ spin_lock(&ailp->xa_lock);
+ break;
+ }
+ }
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+
+ return 0;
+}
+
+/*
* This routine is called when an inode create format structure is found in a
* committed transaction in the log. It's purpose is to initialise the inodes
* being allocated on disk. This requires us to get inode cluster buffers that
@@ -3639,6 +3739,8 @@ xlog_recover_ra_pass2(
case XFS_LI_EFI:
case XFS_LI_EFD:
case XFS_LI_QUOTAOFF:
+ case XFS_LI_RUI:
+ case XFS_LI_RUD:
default:
break;
}
@@ -3662,6 +3764,8 @@ xlog_recover_commit_pass1(
case XFS_LI_EFD:
case XFS_LI_DQUOT:
case XFS_LI_ICREATE:
+ case XFS_LI_RUI:
+ case XFS_LI_RUD:
/* nothing to do in pass 1 */
return 0;
default:
@@ -3692,6 +3796,10 @@ xlog_recover_commit_pass2(
return xlog_recover_efi_pass2(log, item, trans->r_lsn);
case XFS_LI_EFD:
return xlog_recover_efd_pass2(log, item);
+ case XFS_LI_RUI:
+ return xlog_recover_rui_pass2(log, item, trans->r_lsn);
+ case XFS_LI_RUD:
+ return xlog_recover_rud_pass2(log, item);
case XFS_LI_DQUOT:
return xlog_recover_dquot_pass2(log, buffer_list, item,
trans->r_lsn);
@@ -3843,7 +3951,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+ ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4164,127 +4272,156 @@ xlog_recover_process_data(
return 0;
}
-/*
- * Process an extent free intent item that was recovered from
- * the log. We need to free the extents that it describes.
- */
+/* Recover the EFI if necessary. */
STATIC int
xlog_recover_process_efi(
- xfs_mount_t *mp,
- xfs_efi_log_item_t *efip)
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
{
- xfs_efd_log_item_t *efdp;
- xfs_trans_t *tp;
- int i;
- int error = 0;
- xfs_extent_t *extp;
- xfs_fsblock_t startblock_fsb;
-
- ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+ struct xfs_efi_log_item *efip;
+ int error;
/*
- * First check the validity of the extents described by the
- * EFI. If any are bad, then assume that all are bad and
- * just toss the EFI.
+ * Skip EFIs that we've already processed.
*/
- for (i = 0; i < efip->efi_format.efi_nextents; i++) {
- extp = &(efip->efi_format.efi_extents[i]);
- startblock_fsb = XFS_BB_TO_FSB(mp,
- XFS_FSB_TO_DADDR(mp, extp->ext_start));
- if ((startblock_fsb == 0) ||
- (extp->ext_len == 0) ||
- (startblock_fsb >= mp->m_sb.sb_dblocks) ||
- (extp->ext_len >= mp->m_sb.sb_agblocks)) {
- /*
- * This will pull the EFI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- xfs_efi_release(efip);
- return -EIO;
- }
- }
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
+ return 0;
- tp = xfs_trans_alloc(mp, 0);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
- if (error)
- goto abort_error;
- efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+ spin_unlock(&ailp->xa_lock);
+ error = xfs_efi_recover(mp, efip);
+ spin_lock(&ailp->xa_lock);
- for (i = 0; i < efip->efi_format.efi_nextents; i++) {
- extp = &(efip->efi_format.efi_extents[i]);
- error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
- extp->ext_len);
- if (error)
- goto abort_error;
+ return error;
+}
- }
+/* Release the EFI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_efi(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_efi_log_item *efip;
- set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- error = xfs_trans_commit(tp);
- return error;
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_efi_release(efip);
+ spin_lock(&ailp->xa_lock);
+}
+
+/* Recover the RUI if necessary. */
+STATIC int
+xlog_recover_process_rui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_rui_log_item *ruip;
+ int error;
+
+ /*
+ * Skip RUIs that we've already processed.
+ */
+ ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
+ if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
+ return 0;
+
+ spin_unlock(&ailp->xa_lock);
+ error = xfs_rui_recover(mp, ruip);
+ spin_lock(&ailp->xa_lock);
-abort_error:
- xfs_trans_cancel(tp);
return error;
}
+/* Release the RUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_rui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_rui_log_item *ruip;
+
+ ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_rui_release(ruip);
+ spin_lock(&ailp->xa_lock);
+}
+
+/* Is this log item a deferred action intent? */
+static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
+{
+ switch (lip->li_type) {
+ case XFS_LI_EFI:
+ case XFS_LI_RUI:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
- * When this is called, all of the EFIs which did not have
- * corresponding EFDs should be in the AIL. What we do now
- * is free the extents associated with each one.
+ * When this is called, all of the log intent items which did not have
+ * corresponding log done items should be in the AIL. What we do now
+ * is update the data structures associated with each one.
*
- * Since we process the EFIs in normal transactions, they
- * will be removed at some point after the commit. This prevents
- * us from just walking down the list processing each one.
- * We'll use a flag in the EFI to skip those that we've already
- * processed and use the AIL iteration mechanism's generation
- * count to try to speed this up at least a bit.
+ * Since we process the log intent items in normal transactions, they
+ * will be removed at some point after the commit. This prevents us
+ * from just walking down the list processing each one. We'll use a
+ * flag in the intent item to skip those that we've already processed
+ * and use the AIL iteration mechanism's generation count to try to
+ * speed this up at least a bit.
*
- * When we start, we know that the EFIs are the only things in
- * the AIL. As we process them, however, other items are added
- * to the AIL. Since everything added to the AIL must come after
- * everything already in the AIL, we stop processing as soon as
- * we see something other than an EFI in the AIL.
+ * When we start, we know that the intents are the only things in the
+ * AIL. As we process them, however, other items are added to the
+ * AIL.
*/
STATIC int
-xlog_recover_process_efis(
+xlog_recover_process_intents(
struct xlog *log)
{
struct xfs_log_item *lip;
- struct xfs_efi_log_item *efip;
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
+ xfs_lsn_t last_lsn;
ailp = log->l_ailp;
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
while (lip != NULL) {
/*
- * We're done when we see something other than an EFI.
- * There should be no EFIs left in the AIL now.
+ * We're done when we see something other than an intent.
+ * There should be no intents left in the AIL now.
*/
- if (lip->li_type != XFS_LI_EFI) {
+ if (!xlog_item_is_intent(lip)) {
#ifdef DEBUG
for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(lip->li_type != XFS_LI_EFI);
+ ASSERT(!xlog_item_is_intent(lip));
#endif
break;
}
/*
- * Skip EFIs that we've already processed.
+ * We should never see a redo item with a LSN higher than
+ * the last transaction we found in the log at the start
+ * of recovery.
*/
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- continue;
- }
+ ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
- spin_unlock(&ailp->xa_lock);
- error = xlog_recover_process_efi(log->l_mp, efip);
- spin_lock(&ailp->xa_lock);
+ switch (lip->li_type) {
+ case XFS_LI_EFI:
+ error = xlog_recover_process_efi(log->l_mp, ailp, lip);
+ break;
+ case XFS_LI_RUI:
+ error = xlog_recover_process_rui(log->l_mp, ailp, lip);
+ break;
+ }
if (error)
goto out;
lip = xfs_trans_ail_cursor_next(ailp, &cur);
@@ -4296,15 +4433,14 @@ out:
}
/*
- * A cancel occurs when the mount has failed and we're bailing out. Release all
- * pending EFIs so they don't pin the AIL.
+ * A cancel occurs when the mount has failed and we're bailing out.
+ * Release all pending log intent items so they don't pin the AIL.
*/
STATIC int
-xlog_recover_cancel_efis(
+xlog_recover_cancel_intents(
struct xlog *log)
{
struct xfs_log_item *lip;
- struct xfs_efi_log_item *efip;
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
@@ -4314,22 +4450,25 @@ xlog_recover_cancel_efis(
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
/*
- * We're done when we see something other than an EFI.
- * There should be no EFIs left in the AIL now.
+ * We're done when we see something other than an intent.
+ * There should be no intents left in the AIL now.
*/
- if (lip->li_type != XFS_LI_EFI) {
+ if (!xlog_item_is_intent(lip)) {
#ifdef DEBUG
for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(lip->li_type != XFS_LI_EFI);
+ ASSERT(!xlog_item_is_intent(lip));
#endif
break;
}
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
-
- spin_unlock(&ailp->xa_lock);
- xfs_efi_release(efip);
- spin_lock(&ailp->xa_lock);
+ switch (lip->li_type) {
+ case XFS_LI_EFI:
+ xlog_recover_cancel_efi(log->l_mp, ailp, lip);
+ break;
+ case XFS_LI_RUI:
+ xlog_recover_cancel_rui(log->l_mp, ailp, lip);
+ break;
+ }
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
@@ -4355,10 +4494,9 @@ xlog_recover_clear_agi_bucket(
int offset;
int error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
if (error)
- goto out_abort;
+ goto out_error;
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
@@ -5025,6 +5163,7 @@ xlog_do_recover(
xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
return error;
}
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
xlog_recover_check_summary(log);
@@ -5141,16 +5280,17 @@ xlog_recover_finish(
*/
if (log->l_flags & XLOG_RECOVERY_NEEDED) {
int error;
- error = xlog_recover_process_efis(log);
+ error = xlog_recover_process_intents(log);
if (error) {
- xfs_alert(log->l_mp, "Failed to recover EFIs");
+ xfs_alert(log->l_mp, "Failed to recover intents");
return error;
}
+
/*
- * Sync the log to get all the EFIs out of the AIL.
+ * Sync the log to get all the intents out of the AIL.
* This isn't absolutely necessary, but it helps in
* case the unlink transactions would have problems
- * pushing the EFIs out of the way.
+ * pushing the intents out of the way.
*/
xfs_log_force(log->l_mp, XFS_LOG_SYNC);
@@ -5175,7 +5315,7 @@ xlog_recover_cancel(
int error = 0;
if (log->l_flags & XLOG_RECOVERY_NEEDED)
- error = xlog_recover_cancel_efis(log);
+ error = xlog_recover_cancel_intents(log);
return error;
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 536a0ee9cd5af..faeead671f9ff 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
@@ -41,6 +42,7 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_sysfs.h"
+#include "xfs_rmap_btree.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -89,7 +91,6 @@ xfs_uuid_mount(
if (hole < 0) {
xfs_uuid_table = kmem_realloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- xfs_uuid_table_size * sizeof(*xfs_uuid_table),
KM_SLEEP);
hole = xfs_uuid_table_size++;
}
@@ -171,7 +172,7 @@ xfs_sb_validate_fsb_count(
ASSERT(sbp->sb_blocklog >= BBSHIFT);
/* Limited by ULONG_MAX of page cache index */
- if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+ if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
return -EFBIG;
return 0;
}
@@ -231,6 +232,8 @@ xfs_initialize_perag(
if (maxagi)
*maxagi = index;
+
+ mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
return 0;
out_unwind:
@@ -273,13 +276,15 @@ xfs_readsb(
buf_ops = NULL;
/*
- * Allocate a (locked) buffer to hold the superblock.
- * This will be kept around at all times to optimize
- * access to the superblock.
+ * Allocate a (locked) buffer to hold the superblock. This will be kept
+ * around at all times to optimize access to the superblock. Therefore,
+ * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
+ * elevated.
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0, &bp, buf_ops);
+ BTOBB(sector_size), XBF_NO_IOACCT, &bp,
+ buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
@@ -678,9 +683,13 @@ xfs_mountfs(
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
xfs_ialloc_compute_maxlevels(mp);
+ xfs_rmapbt_compute_maxlevels(mp);
xfs_set_maxicount(mp);
+ /* enable fail_at_unmount as default */
+ mp->m_fail_unmount = 1;
+
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
if (error)
goto out;
@@ -690,10 +699,15 @@ xfs_mountfs(
if (error)
goto out_remove_sysfs;
- error = xfs_uuid_mount(mp);
+ error = xfs_error_sysfs_init(mp);
if (error)
goto out_del_stats;
+
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_remove_error_sysfs;
+
/*
* Set the minimum read and write sizes
*/
@@ -957,6 +971,7 @@ xfs_mountfs(
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
out_log_dealloc:
+ mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -968,6 +983,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_remove_error_sysfs:
+ xfs_error_sysfs_del(mp);
out_del_stats:
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
@@ -1006,6 +1023,14 @@ xfs_unmountfs(
xfs_log_force(mp, XFS_LOG_SYNC);
/*
+ * We now need to tell the world we are unmounting. This will allow
+ * us to detect that the filesystem is going away and we should error
+ * out anything that we have been retrying in the background. This will
+ * prevent neverending retries in AIL pushing from hanging the unmount.
+ */
+ mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+ /*
* Flush all pending changes from the AIL.
*/
xfs_ail_push_all_sync(mp->m_ail);
@@ -1056,6 +1081,7 @@ xfs_unmountfs(
#endif
xfs_free_perag(mp);
+ xfs_error_sysfs_del(mp);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
@@ -1195,7 +1221,7 @@ xfs_mod_fdblocks(
batch = XFS_FDBLOCKS_BATCH;
__percpu_counter_add(&mp->m_fdblocks, delta, batch);
- if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp),
+ if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bac6b3435591b..b36676cde1030 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -37,6 +37,32 @@ enum {
XFS_LOWSP_MAX,
};
+/*
+ * Error Configuration
+ *
+ * Error classes define the subsystem the configuration belongs to.
+ * Error numbers define the errors that are configurable.
+ */
+enum {
+ XFS_ERR_METADATA,
+ XFS_ERR_CLASS_MAX,
+};
+enum {
+ XFS_ERR_DEFAULT,
+ XFS_ERR_EIO,
+ XFS_ERR_ENOSPC,
+ XFS_ERR_ENODEV,
+ XFS_ERR_ERRNO_MAX,
+};
+
+#define XFS_ERR_RETRY_FOREVER -1
+
+struct xfs_error_cfg {
+ struct xfs_kobj kobj;
+ int max_retries;
+ unsigned long retry_timeout; /* in jiffies, 0 = no timeout */
+};
+
typedef struct xfs_mount {
struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */
@@ -90,9 +116,15 @@ typedef struct xfs_mount {
uint m_bmap_dmnr[2]; /* min bmap btree records */
uint m_inobt_mxr[2]; /* max inobt btree records */
uint m_inobt_mnr[2]; /* min inobt btree records */
+ uint m_rmap_mxr[2]; /* max rmap btree records */
+ uint m_rmap_mnr[2]; /* min rmap btree records */
uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
uint m_in_maxlevels; /* max inobt btree levels. */
+ uint m_rmap_maxlevels; /* max rmap btree levels */
+ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
+ uint m_alloc_set_aside; /* space we can't use */
+ uint m_ag_max_usable; /* max space per AG */
struct radix_tree_root m_perag_tree; /* per-ag accounting info */
spinlock_t m_perag_lock; /* lock for m_perag_tree */
struct mutex m_growlock; /* growfs mutex */
@@ -127,6 +159,9 @@ typedef struct xfs_mount {
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct xfs_kobj m_error_kobj;
+ struct xfs_kobj m_error_meta_kobj;
+ struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
@@ -148,6 +183,7 @@ typedef struct xfs_mount {
*/
__uint32_t m_generation;
+ bool m_fail_unmount;
#ifdef DEBUG
/*
* DEBUG mode instrumentation to test and/or trigger delayed allocation
@@ -166,6 +202,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
must be synchronous except
for space allocations */
+#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
@@ -231,12 +268,12 @@ static inline unsigned long
xfs_preferred_iosize(xfs_mount_t *mp)
{
if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
- return PAGE_CACHE_SIZE;
+ return PAGE_SIZE;
return (mp->m_swidth ?
(mp->m_swidth << mp->m_sb.sb_blocklog) :
((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
(1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
- PAGE_CACHE_SIZE));
+ PAGE_SIZE));
}
#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
@@ -364,4 +401,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
xfs_off_t count_fsb);
+struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
+ int error_class, int error);
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 184c44effdd5a..69e2986a37761 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -22,6 +22,11 @@
BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
#structname ") is wrong, expected " #size)
+#define XFS_CHECK_OFFSET(structname, member, off) /* build-time offsetof() check */ \
+ BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+ "XFS: offsetof(" #structname ", " #member ") is wrong, " \
+ "expected " #off)
+
static inline void __init
xfs_check_ondisk_structs(void)
{
@@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
@@ -42,11 +49,14 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4);
/* dir/attr trees */
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
@@ -75,27 +85,39 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
*/
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6);
+ XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
+ XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ade236e90bb36..0f14b2e4bf6cb 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
@@ -79,32 +80,6 @@ xfs_fs_get_uuid(
return 0;
}
-static void
-xfs_bmbt_to_iomap(
- struct xfs_inode *ip,
- struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_DELALLOC;
- } else {
- iomap->blkno =
- XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
- if (imap->br_state == XFS_EXT_UNWRITTEN)
- iomap->type = IOMAP_UNWRITTEN;
- else
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
/*
* Get a layout for the pNFS client.
*/
@@ -293,8 +268,8 @@ xfs_fs_commit_blocks(
* Make sure reads through the pagecache see the new data.
*/
error = invalidate_inode_pages2_range(inode->i_mapping,
- start >> PAGE_CACHE_SHIFT,
- (end - 1) >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT,
+ (end - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(error);
error = xfs_iomap_write_unwritten(ip, start, length);
@@ -308,12 +283,9 @@ xfs_fs_commit_blocks(
goto out_drop_iolock;
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
goto out_drop_iolock;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 93f74853961b1..e8339f74966b1 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
#ifndef _XFS_PNFS_H
#define _XFS_PNFS_H 1
-#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
struct iomap *iomap, bool write, u32 *device_generation);
@@ -15,5 +15,5 @@ xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
{
return 0;
}
-#endif /* CONFIG_NFSD_PNFS */
+#endif /* CONFIG_EXPORTFS_BLOCK_OPS */
#endif /* _XFS_PNFS_H */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index be125e1758c1a..a60d9e2739d14 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -783,13 +783,10 @@ xfs_qm_qino_alloc(
}
}
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
- XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+ XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
+ if (error)
return error;
- }
if (need_alloc) {
error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index f4d0e0a8f517c..475a3882a81fe 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
- xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
goto out_put;
}
@@ -436,12 +434,9 @@ xfs_qm_scall_setqlim(
defq = xfs_get_defquota(dqp, q);
xfs_dqunlock(dqp);
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
+ if (error)
goto out_rele;
- }
xfs_dqlock(dqp);
xfs_trans_dqjoin(tp, dqp);
@@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end(
int error;
xfs_qoff_logitem_t *qoffi;
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+ if (error)
return error;
- }
qoffi = xfs_trans_get_qoff_item(tp, startqoff,
flags & XFS_ALL_QUOTA_ACCT);
@@ -603,12 +594,9 @@ xfs_qm_log_quotaoff(
*qoffstartp = NULL;
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+ if (error)
goto out;
- }
qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
xfs_trans_log_quotaoff_item(tp, qoffi);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
new file mode 100644
index 0000000000000..2500f28689d5b
--- /dev/null
+++ b/fs/xfs/xfs_rmap_item.c
@@ -0,0 +1,536 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+
+
+kmem_zone_t *xfs_rui_zone;
+kmem_zone_t *xfs_rud_zone;
+
+static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_rui_log_item, rui_item); /* lip must be the rui_item member of an RUI */
+}
+
+void
+xfs_rui_item_free(
+ struct xfs_rui_log_item *ruip)
+{
+ if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) /* same size test as xfs_rui_init() */
+ kmem_free(ruip); /* oversized item: xfs_rui_init() used kmem_zalloc() */
+ else
+ kmem_zone_free(xfs_rui_zone, ruip); /* normal item: came from xfs_rui_zone */
+}
+
+/*
+ * This returns the number of bytes needed to log the given rui item:
+ * sizeof the rui_log_format structure (counted as covering the first
+ * extent) plus one xfs_map_extent for each further extent.
+ */
+static inline int
+xfs_rui_item_sizeof(
+ struct xfs_rui_log_item *ruip)
+{
+ return sizeof(struct xfs_rui_log_format) +
+ (ruip->rui_format.rui_nextents - 1) *
+ sizeof(struct xfs_map_extent);
+}
+
+STATIC void
+xfs_rui_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs, /* in/out: bumped by the single iovec an RUI uses */
+ int *nbytes) /* in/out: bumped by xfs_rui_item_sizeof() bytes */
+{
+ *nvecs += 1;
+ *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip));
+}
+
+/*
+ * Fill in the log vector for the given RUI log item.  A single iovec is
+ * used; xlog_copy_iovec() is handed the rui_log_format structure embedded
+ * in the RUI item together with its size from xfs_rui_item_sizeof().
+ * Before that we assert that every extent slot has been filled, i.e. that
+ * rui_next_extent has reached rui_nextents.
+ */
+STATIC void
+xfs_rui_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(atomic_read(&ruip->rui_next_extent) ==
+ ruip->rui_format.rui_nextents);
+
+ ruip->rui_format.rui_type = XFS_LI_RUI;
+ ruip->rui_format.rui_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
+ xfs_rui_item_sizeof(ruip));
+}
+
+/*
+ * Pinning has no meaning for an rui item, so this op is an empty stub.
+ */
+STATIC void
+xfs_rui_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * The unpin operation is the last place an RUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the RUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the RUI to either construct
+ * and commit the RUD or drop the RUD's reference in the event of error. Simply
+ * drop the log's RUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_rui_item_unpin(
+ struct xfs_log_item *lip,
+ int remove) /* not consulted: the reference is dropped either way */
+{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
+ xfs_rui_release(ruip); /* frees the RUI only if this was the last reference */
+}
+
+/*
+ * RUI items have no locking or pushing. However, since RUIs are pulled from
+ * the AIL when their corresponding RUDs are committed to disk, their situation
+ * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log. This should help in getting the RUI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_rui_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list) /* unused: nothing is queued for writeback */
+{
+ return XFS_ITEM_PINNED;
+}
+
+/*
+ * The RUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an RUD isn't going to be
+ * constructed and thus we free the RUI here directly.
+ */
+STATIC void
+xfs_rui_item_unlock(
+ struct xfs_log_item *lip)
+{
+ if (lip->li_flags & XFS_LI_ABORTED) /* abort case only; otherwise nothing to do */
+ xfs_rui_item_free(RUI_ITEM(lip)); /* freed outright, ignoring rui_refcount */
+}
+
+/*
+ * The RUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged (the lsn argument, unchanged).
+ */
+STATIC xfs_lsn_t
+xfs_rui_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ return lsn;
+}
+
+/*
+ * The RUI committing op is intentionally empty: no dependency tracking is
+ * done here.  NOTE(review): the original wording was inherited from the
+ * extent-free intent (EFI) item and talked about "free extents"; the
+ * intent appears to be that any dependency tracking is handled by the
+ * "enclosing" metadata object (e.g. the locked inode) -- confirm.
+ */
+STATIC void
+xfs_rui_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all rui log items; xfs_rui_init() installs it.
+ */
+static const struct xfs_item_ops xfs_rui_item_ops = {
+ .iop_size = xfs_rui_item_size,
+ .iop_format = xfs_rui_item_format,
+ .iop_pin = xfs_rui_item_pin,
+ .iop_unpin = xfs_rui_item_unpin,
+ .iop_unlock = xfs_rui_item_unlock,
+ .iop_committed = xfs_rui_item_committed,
+ .iop_push = xfs_rui_item_push,
+ .iop_committing = xfs_rui_item_committing,
+};
+
+/*
+ * Allocate and initialize an rui item with the given number of extents.
+ */
+struct xfs_rui_log_item *
+xfs_rui_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_rui_log_item *ruip;
+ uint size;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { /* too big for a zone object */
+ size = (uint)(sizeof(struct xfs_rui_log_item) +
+ ((nextents - 1) * sizeof(struct xfs_map_extent)));
+ ruip = kmem_zalloc(size, KM_SLEEP);
+ } else {
+ ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
+ }
+
+ xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
+ ruip->rui_format.rui_nextents = nextents;
+ ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; /* id is the item's own address */
+ atomic_set(&ruip->rui_next_extent, 0); /* no extent slots filled yet */
+ atomic_set(&ruip->rui_refcount, 2); /* two refs; each xfs_rui_release() drops one */
+
+ return ruip;
+}
+
+/*
+ * Copy an RUI format buffer from buf into dst_rui_fmt; 0 or -EFSCORRUPTED.
+ * The region must cover rui_nextents before that field is read, and the size
+ * is computed in 64 bits so a huge rui_nextents cannot wrap to a small one.
+ */
+int
+xfs_rui_copy_format(
+	struct xfs_log_iovec		*buf,
+	struct xfs_rui_log_format	*dst_rui_fmt)
+{
+	struct xfs_rui_log_format	*src_rui_fmt = buf->i_addr;
+	unsigned long long		len;
+
+	if (buf->i_len < (int)offsetof(struct xfs_rui_log_format, rui_extents))
+		return -EFSCORRUPTED;
+
+	len = sizeof(struct xfs_rui_log_format) +
+	      ((unsigned long long)src_rui_fmt->rui_nextents - 1) *
+	      sizeof(struct xfs_map_extent);
+	if ((unsigned long long)buf->i_len != len)
+		return -EFSCORRUPTED;
+	memcpy(dst_rui_fmt, src_rui_fmt, len);
+	return 0;
+}
+
+/*
+ * Freeing the RUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the RUI may not yet have been placed in the AIL
+ * when called by xfs_rui_release() from RUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the RUI.
+ */
+void
+xfs_rui_release(
+ struct xfs_rui_log_item *ruip)
+{
+ if (atomic_dec_and_test(&ruip->rui_refcount)) { /* true only for the final reference */
+ xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_rui_item_free(ruip); /* ruip must not be touched after this */
+ }
+}
+
+static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_rud_log_item, rud_item); /* lip must be the rud_item member of an RUD */
+}
+
+STATIC void
+xfs_rud_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs, /* in/out: bumped by the single iovec an RUD uses */
+ int *nbytes) /* in/out: bumped by the fixed RUD format size */
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_rud_log_format);
+}
+
+/*
+ * Fill in the log vector for the given RUD log item.  A single iovec is
+ * used; xlog_copy_iovec() is handed the fixed-size rud_log_format
+ * structure embedded in the RUD item.  Unlike the RUI formatter there is
+ * no extent-slot assertion here: the logged size is always
+ * sizeof(struct xfs_rud_log_format).
+ */
+STATIC void
+xfs_rud_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ rudp->rud_format.rud_type = XFS_LI_RUD;
+ rudp->rud_format.rud_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format,
+ sizeof(struct xfs_rud_log_format));
+}
+
+/*
+ * Pinning has no meaning for an rud item, so this op is an empty stub.
+ */
+STATIC void
+xfs_rud_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an rud item, unpinning does
+ * not either; both arguments are ignored.
+ */
+STATIC void
+xfs_rud_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on an rud item. It is simply stuck
+ * waiting for the log to be flushed to disk, so always report it as pinned.
+ */
+STATIC uint
+xfs_rud_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ return XFS_ITEM_PINNED;
+}
+
+/*
+ * The RUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the RUI and free the
+ * RUD.
+ */
+STATIC void
+xfs_rud_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+
+ if (lip->li_flags & XFS_LI_ABORTED) { /* abort case only; otherwise nothing to do */
+ xfs_rui_release(rudp->rud_ruip); /* drop the RUI reference this RUD was holding */
+ kmem_zone_free(xfs_rud_zone, rudp); /* rudp (and lip) are gone after this */
+ }
+}
+
+/*
+ * When the rud item is committed to disk, all we need to do is delete our
+ * reference to our partner rui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.
+ */
+STATIC xfs_lsn_t
+xfs_rud_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+
+ /*
+ * Drop the RUI reference regardless of whether the RUD has been
+ * aborted. Once the RUD transaction is constructed, it is the sole
+ * responsibility of the RUD to release the RUI (even if the RUI is
+ * aborted due to log I/O error).
+ */
+ xfs_rui_release(rudp->rud_ruip);
+ kmem_zone_free(xfs_rud_zone, rudp); /* lip lives inside rudp, so it is gone too */
+
+ return (xfs_lsn_t)-1; /* the lsn argument is deliberately not returned */
+}
+
+/*
+ * The RUD committing op is intentionally empty: no dependency tracking is
+ * done here.  NOTE(review): the original wording was inherited from the
+ * extent-free done (EFD) item and talked about "free extents"; the
+ * intent appears to be that any dependency tracking is handled by the
+ * "enclosing" metadata object (e.g. the locked inode) -- confirm.
+ */
+STATIC void
+xfs_rud_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all rud log items; xfs_rud_init() installs it.
+ */
+static const struct xfs_item_ops xfs_rud_item_ops = {
+ .iop_size = xfs_rud_item_size,
+ .iop_format = xfs_rud_item_format,
+ .iop_pin = xfs_rud_item_pin,
+ .iop_unpin = xfs_rud_item_unpin,
+ .iop_unlock = xfs_rud_item_unlock,
+ .iop_committed = xfs_rud_item_committed,
+ .iop_push = xfs_rud_item_push,
+ .iop_committing = xfs_rud_item_committing,
+};
+
+/*
+ * Allocate and initialize an rud item tied to the given rui (shares its id).
+ */
+struct xfs_rud_log_item *
+xfs_rud_init(
+ struct xfs_mount *mp,
+ struct xfs_rui_log_item *ruip)
+
+{
+ struct xfs_rud_log_item *rudp;
+
+ rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
+ xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops);
+ rudp->rud_ruip = ruip; /* released later by the RUD's unlock/committed ops */
+ rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; /* logged link back to the RUI */
+
+ return rudp;
+}
+
+/*
+ * Process an rmap update intent item that was recovered from the log:
+ * validate every extent, then replay them all in one transaction with an RUD.
+ */
+int
+xfs_rui_recover(
+ struct xfs_mount *mp,
+ struct xfs_rui_log_item *ruip)
+{
+ int i;
+ int error = 0;
+ struct xfs_map_extent *rmap;
+ xfs_fsblock_t startblock_fsb;
+ bool op_ok;
+ struct xfs_rud_log_item *rudp;
+ enum xfs_rmap_intent_type type;
+ int whichfork;
+ xfs_exntst_t state;
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *rcur = NULL;
+
+ ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags));
+
+ /*
+ * First check the validity of the extents described by the
+ * RUI. If any are bad, then assume that all are bad and
+ * just toss the RUI.
+ */
+ for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
+ rmap = &ruip->rui_format.rui_extents[i];
+ startblock_fsb = XFS_BB_TO_FSB(mp,
+ XFS_FSB_TO_DADDR(mp, rmap->me_startblock));
+ switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ case XFS_RMAP_EXTENT_MAP:
+ case XFS_RMAP_EXTENT_UNMAP:
+ case XFS_RMAP_EXTENT_CONVERT:
+ case XFS_RMAP_EXTENT_ALLOC:
+ case XFS_RMAP_EXTENT_FREE:
+ op_ok = true;
+ break;
+ default:
+ op_ok = false;
+ break;
+ }
+ if (!op_ok || startblock_fsb == 0 ||
+ rmap->me_len == 0 ||
+ startblock_fsb >= mp->m_sb.sb_dblocks ||
+ rmap->me_len >= mp->m_sb.sb_agblocks ||
+ (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) {
+ /*
+ * If this drops the last reference, the RUI is pulled
+ * from the AIL and its memory is freed.
+ */
+ set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
+ xfs_rui_release(ruip);
+ return -EIO;
+ }
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
+ return error; /* RUI is neither marked recovered nor released here */
+ rudp = xfs_trans_get_rud(tp, ruip);
+
+ for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
+ rmap = &ruip->rui_format.rui_extents[i];
+ state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { /* logged type -> in-core intent type */
+ case XFS_RMAP_EXTENT_MAP:
+ type = XFS_RMAP_MAP;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP:
+ type = XFS_RMAP_UNMAP;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT:
+ type = XFS_RMAP_CONVERT;
+ break;
+ case XFS_RMAP_EXTENT_ALLOC:
+ type = XFS_RMAP_ALLOC;
+ break;
+ case XFS_RMAP_EXTENT_FREE:
+ type = XFS_RMAP_FREE;
+ break;
+ default: /* already rejected by the validation loop above */
+ error = -EFSCORRUPTED;
+ goto abort_error;
+ }
+ error = xfs_trans_log_finish_rmap_update(tp, rudp, type,
+ rmap->me_owner, whichfork,
+ rmap->me_startoff, rmap->me_startblock,
+ rmap->me_len, state, &rcur);
+ if (error)
+ goto abort_error;
+
+ }
+
+ xfs_rmap_finish_one_cleanup(tp, rcur, error); /* error is 0 on this path */
+ set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
+ error = xfs_trans_commit(tp);
+ return error;
+
+abort_error:
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
new file mode 100644
index 0000000000000..aefcc3a318a59
--- /dev/null
+++ b/fs/xfs/xfs_rmap_item.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_RMAP_ITEM_H__
+#define __XFS_RMAP_ITEM_H__
+
+/*
+ * There are (currently) five rmap btree redo item types: map, unmap,
+ * convert, alloc, and free. The common abbreviations for the paired log
+ * items are RUI (rmap update intent) and RUD (rmap update done). The redo
+ * item type is encoded in the flags field of each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated rmapbt updates. Typically, the first
+ * transaction will record a bmbt update, followed by some number of
+ * transactions containing rmapbt updates, and finally transactions with any
+ * bnobt/cntbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * (rmapbt/bnobt/cntbt) metadata updates in the non-first transaction.
+ */
+
+/* kernel only RUI/RUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_RUI_MAX_FAST_EXTENTS 16
+
+/*
+ * Define RUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define XFS_RUI_RECOVERED 1
+
+/*
+ * This is the "rmap update intent" log item. It is used to log the fact that
+ * some reverse mappings need to change. It is used in conjunction with the
+ * "rmap update done" log item described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_rui_log_item {
+ struct xfs_log_item rui_item;
+ atomic_t rui_refcount; /* the item is freed when this drops to zero */
+ atomic_t rui_next_extent; /* asserted equal to rui_nextents when formatted */
+ unsigned long rui_flags; /* misc flags */
+ struct xfs_rui_log_format rui_format; /* presumably kept last so extra extents can follow; confirm */
+};
+
+/*
+ * This is the "rmap update done" log item. It is used to log the fact that
+ * some rmapbt updates mentioned in an earlier rui item have been performed.
+ */
+struct xfs_rud_log_item {
+ struct xfs_log_item rud_item;
+ struct xfs_rui_log_item *rud_ruip; /* the RUI this RUD completes */
+ struct xfs_rud_log_format rud_format; /* fixed size; carries the RUI's id */
+};
+
+extern struct kmem_zone *xfs_rui_zone;
+extern struct kmem_zone *xfs_rud_zone;
+
+struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint);
+struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *,
+ struct xfs_rui_log_item *);
+int xfs_rui_copy_format(struct xfs_log_iovec *buf,
+ struct xfs_rui_log_format *dst_rui_fmt);
+void xfs_rui_item_free(struct xfs_rui_log_item *);
+void xfs_rui_release(struct xfs_rui_log_item *);
+int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip);
+
+#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index abf44435d04a3..802bcc326d9fb 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -769,7 +770,7 @@ xfs_growfs_rt_alloc(
xfs_daddr_t d; /* disk block address */
int error; /* error return value */
xfs_fsblock_t firstblock;/* first block allocated in xaction */
- struct xfs_bmap_free flist; /* list of freed blocks */
+ struct xfs_defer_ops dfops; /* list of freed blocks */
xfs_fsblock_t fsbno; /* filesystem block for bno */
struct xfs_bmbt_irec map; /* block map output */
int nmap; /* number of block maps */
@@ -780,29 +781,28 @@ xfs_growfs_rt_alloc(
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
/*
* Reserve space & log for one extent added to the file.
*/
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
- resblks, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
+ 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ return error;
/*
* Lock the inode.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_bmap_init(&flist, &firstblock);
+ xfs_defer_init(&dfops, &firstblock);
/*
* Allocate blocks to the bitmap file.
*/
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_METADATA, &firstblock,
- resblks, &map, &nmap, &flist);
+ resblks, &map, &nmap, &dfops);
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
@@ -810,7 +810,7 @@ xfs_growfs_rt_alloc(
/*
* Free any blocks freed up in the transaction, then commit.
*/
- error = xfs_bmap_finish(&tp, &flist, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_bmap_cancel;
error = xfs_trans_commit(tp);
@@ -823,14 +823,13 @@ xfs_growfs_rt_alloc(
for (bno = map.br_startoff, fsbno = map.br_startblock;
bno < map.br_startoff + map.br_blockcount;
bno++, fsbno++) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
/*
* Reserve log for one block zeroing.
*/
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
- 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
+ 0, 0, 0, &tp);
if (error)
- goto out_trans_cancel;
+ return error;
/*
* Lock the bitmap inode.
*/
@@ -864,7 +863,7 @@ xfs_growfs_rt_alloc(
return 0;
out_bmap_cancel:
- xfs_bmap_cancel(&flist);
+ xfs_defer_cancel(&dfops);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
@@ -994,11 +993,10 @@ xfs_growfs_rt(
/*
* Start a transaction, get the log reservation.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
- 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
+ &tp);
if (error)
- goto error_cancel;
+ break;
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 76c0a4a9bb170..355dd9e1cb641 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -98,8 +98,6 @@ xfs_growfs_rt(
/*
* From xfs_rtbitmap.c
*/
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len, int val,
xfs_rtblock_t *new, int *stat);
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 8686df6c76095..6e812fe0fd43c 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -61,6 +61,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "bmbt2", XFSSTAT_END_BMBT_V2 },
{ "ibt2", XFSSTAT_END_IBT_V2 },
{ "fibt2", XFSSTAT_END_FIBT_V2 },
+ { "rmapbt", XFSSTAT_END_RMAP_V2 },
/* we print both series of quota information together */
{ "qm", XFSSTAT_END_QM },
};
@@ -128,7 +129,6 @@ static int xqm_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations xqm_proc_fops = {
- .owner = THIS_MODULE,
.open = xqm_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 483b0eff19883..657865f51e783 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -197,7 +197,23 @@ struct xfsstats {
__uint32_t xs_fibt_2_alloc;
__uint32_t xs_fibt_2_free;
__uint32_t xs_fibt_2_moves;
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6)
+#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15)
+ __uint32_t xs_rmap_2_lookup;
+ __uint32_t xs_rmap_2_compare;
+ __uint32_t xs_rmap_2_insrec;
+ __uint32_t xs_rmap_2_delrec;
+ __uint32_t xs_rmap_2_newroot;
+ __uint32_t xs_rmap_2_killroot;
+ __uint32_t xs_rmap_2_increment;
+ __uint32_t xs_rmap_2_decrement;
+ __uint32_t xs_rmap_2_lshift;
+ __uint32_t xs_rmap_2_rshift;
+ __uint32_t xs_rmap_2_split;
+ __uint32_t xs_rmap_2_join;
+ __uint32_t xs_rmap_2_alloc;
+ __uint32_t xs_rmap_2_free;
+ __uint32_t xs_rmap_2_moves;
+#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_RMAP_V2+6)
__uint32_t xs_qm_dqreclaims;
__uint32_t xs_qm_dqreclaim_misses;
__uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d760934109b5d..24ef83ef04de2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -46,6 +46,7 @@
#include "xfs_quota.h"
#include "xfs_sysfs.h"
#include "xfs_ondisk.h"
+#include "xfs_rmap_item.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -58,8 +59,7 @@
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
+struct bio_set *xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
@@ -350,6 +350,7 @@ xfs_parseargs(
case Opt_pqnoenforce:
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ break;
case Opt_gquota:
case Opt_grpquota:
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
@@ -546,7 +547,7 @@ xfs_showargs(
return 0;
}
-__uint64_t
+static __uint64_t
xfs_max_file_offset(
unsigned int blockshift)
{
@@ -556,10 +557,10 @@ xfs_max_file_offset(
/* Figure out maximum filesize, on Linux this can depend on
* the filesystem blocksize (on 32 bit platforms).
* __block_write_begin does this in an [unsigned] long...
- * page->index << (PAGE_CACHE_SHIFT - bbits)
+ * page->index << (PAGE_SHIFT - bbits)
* So, for page sized blocks (4K on 32 bit platforms),
* this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
- * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+ * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
* but for smaller blocksizes it is less (bbits = log2 bsize).
* Note1: get_block_t takes a long (implicit cast from above)
* Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
@@ -570,10 +571,10 @@ xfs_max_file_offset(
#if BITS_PER_LONG == 32
# if defined(CONFIG_LBDAF)
ASSERT(sizeof(sector_t) == 8);
- pagefactor = PAGE_CACHE_SIZE;
+ pagefactor = PAGE_SIZE;
bitshift = BITS_PER_LONG;
# else
- pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+ pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);
# endif
#endif
@@ -928,7 +929,7 @@ xfs_fs_alloc_inode(
/*
* Now that the generic code is guaranteed not to be accessing
- * the linux inode, we can reclaim the inode.
+ * the linux inode, we can inactivate and reclaim the inode.
*/
STATIC void
xfs_fs_destroy_inode(
@@ -938,9 +939,14 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- XFS_STATS_INC(ip->i_mount, vn_reclaim);
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
+
+ xfs_inactive(ip);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+ XFS_STATS_INC(ip->i_mount, vn_reclaim);
/*
* We should never get here with one of the reclaim flags already set.
@@ -987,24 +993,6 @@ xfs_fs_inode_init_once(
"xfsino", ip->i_ino);
}
-STATIC void
-xfs_fs_evict_inode(
- struct inode *inode)
-{
- xfs_inode_t *ip = XFS_I(inode);
-
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-
- trace_xfs_evict_inode(ip);
-
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
- XFS_STATS_INC(ip->i_mount, vn_rele);
- XFS_STATS_INC(ip->i_mount, vn_remove);
-
- xfs_inactive(ip);
-}
-
/*
* We do an unlocked check for XFS_IDONTCACHE here because we are already
* serialised against cache hits here via the inode->i_lock and igrab() in
@@ -1088,7 +1076,7 @@ xfs_fs_statfs(
statp->f_blocks = sbp->sb_dblocks - lsize;
spin_unlock(&mp->m_sb_lock);
- statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
statp->f_bavail = statp->f_bfree;
fakeinos = statp->f_bfree << sbp->sb_inopblog;
@@ -1276,6 +1264,16 @@ xfs_fs_remount(
return -EINVAL;
}
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_ro_compat_feature(sbp,
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EINVAL;
+ }
+
mp->m_flags &= ~XFS_MOUNT_RDONLY;
/*
@@ -1297,6 +1295,7 @@ xfs_fs_remount(
*/
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
+ xfs_queue_eofblocks(mp);
}
/* rw -> ro */
@@ -1309,6 +1308,13 @@ xfs_fs_remount(
* return it to the same size.
*/
xfs_save_resvblks(mp);
+
+ /*
+ * Cancel background eofb scanning so it cannot race with the
+ * final log force+buftarg wait and deadlock the remount.
+ */
+ cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
}
@@ -1558,21 +1564,19 @@ xfs_fs_fill_super(
if (mp->m_flags & XFS_MOUNT_DAX) {
xfs_warn(mp,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- if (sb->s_blocksize != PAGE_SIZE) {
- xfs_alert(mp,
- "Filesystem block size invalid for DAX Turning DAX off.");
- mp->m_flags &= ~XFS_MOUNT_DAX;
- } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+
+ error = bdev_dax_supported(sb, sb->s_blocksize);
+ if (error) {
xfs_alert(mp,
- "Block device does not support DAX Turning DAX off.");
+ "DAX unsupported by block device. Turning off DAX.");
mp->m_flags &= ~XFS_MOUNT_DAX;
}
}
- if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
xfs_alert(mp,
- "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+ "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
error = xfs_mountfs(mp);
if (error)
@@ -1663,7 +1667,6 @@ xfs_fs_free_cached_objects(
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
- .evict_inode = xfs_fs_evict_inode,
.drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
@@ -1688,23 +1691,19 @@ MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
{
-
- xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
- if (!xfs_ioend_zone)
+ xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+ offsetof(struct xfs_ioend, io_inline_bio));
+ if (!xfs_ioend_bioset)
goto out;
- xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
- xfs_ioend_zone);
- if (!xfs_ioend_pool)
- goto out_destroy_ioend_zone;
-
xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
"xfs_log_ticket");
if (!xfs_log_ticket_zone)
- goto out_destroy_ioend_pool;
+ goto out_free_ioend_bioset;
- xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
- "xfs_bmap_free_item");
+ xfs_bmap_free_item_zone = kmem_zone_init(
+ sizeof(struct xfs_extent_free_item),
+ "xfs_bmap_free_item");
if (!xfs_bmap_free_item_zone)
goto out_destroy_log_ticket_zone;
@@ -1771,8 +1770,24 @@ xfs_init_zones(void)
if (!xfs_icreate_zone)
goto out_destroy_ili_zone;
+ xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item),
+ "xfs_rud_item");
+ if (!xfs_rud_zone)
+ goto out_destroy_icreate_zone;
+
+ xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) +
+ ((XFS_RUI_MAX_FAST_EXTENTS - 1) *
+ sizeof(struct xfs_map_extent))),
+ "xfs_rui_item");
+ if (!xfs_rui_zone)
+ goto out_destroy_rud_zone;
+
return 0;
+ out_destroy_rud_zone:
+ kmem_zone_destroy(xfs_rud_zone);
+ out_destroy_icreate_zone:
+ kmem_zone_destroy(xfs_icreate_zone);
out_destroy_ili_zone:
kmem_zone_destroy(xfs_ili_zone);
out_destroy_inode_zone:
@@ -1797,10 +1812,8 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_bmap_free_item_zone);
out_destroy_log_ticket_zone:
kmem_zone_destroy(xfs_log_ticket_zone);
- out_destroy_ioend_pool:
- mempool_destroy(xfs_ioend_pool);
- out_destroy_ioend_zone:
- kmem_zone_destroy(xfs_ioend_zone);
+ out_free_ioend_bioset:
+ bioset_free(xfs_ioend_bioset);
out:
return -ENOMEM;
}
@@ -1813,6 +1826,8 @@ xfs_destroy_zones(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_zone_destroy(xfs_rui_zone);
+ kmem_zone_destroy(xfs_rud_zone);
kmem_zone_destroy(xfs_icreate_zone);
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_inode_zone);
@@ -1826,9 +1841,7 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
kmem_zone_destroy(xfs_log_ticket_zone);
- mempool_destroy(xfs_ioend_pool);
- kmem_zone_destroy(xfs_ioend_zone);
-
+ bioset_free(xfs_ioend_bioset);
}
STATIC int __init
@@ -1864,6 +1877,9 @@ init_xfs_fs(void)
printk(KERN_INFO XFS_VERSION_STRING " with "
XFS_BUILD_OPTIONS " enabled\n");
+ xfs_extent_free_init_defer_op();
+ xfs_rmap_update_init_defer_op();
+
xfs_dir_startup();
error = xfs_init_zones();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2dfb1ce4585f2..529bce9fc37ef 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,8 +61,6 @@ struct xfs_mount;
struct xfs_buftarg;
struct block_device;
-extern __uint64_t xfs_max_file_offset(unsigned int);
-
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index b44284c1adda1..58142aeeeea69 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -26,6 +26,7 @@
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
+#include "xfs_defer.h"
#include "xfs_dir2.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
@@ -131,6 +132,8 @@ xfs_readlink(
trace_xfs_readlink(ip);
+ ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE));
+
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -150,12 +153,7 @@ xfs_readlink(
}
- if (ip->i_df.if_flags & XFS_IFINLINE) {
- memcpy(link, ip->i_df.if_u1.if_data, pathlen);
- link[pathlen] = '\0';
- } else {
- error = xfs_readlink_bmap(ip, link);
- }
+ error = xfs_readlink_bmap(ip, link);
out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -175,7 +173,7 @@ xfs_symlink(
struct xfs_inode *ip = NULL;
int error = 0;
int pathlen;
- struct xfs_bmap_free free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
xfs_fileoff_t first_fsb;
@@ -221,7 +219,6 @@ xfs_symlink(
if (error)
return error;
- tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
/*
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
@@ -231,13 +228,15 @@ xfs_symlink(
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
if (error == -ENOSPC && fs_blocks == 0) {
resblks = 0;
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0,
+ &tp);
}
if (error)
- goto out_trans_cancel;
+ goto out_release_inode;
xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -271,7 +270,7 @@ xfs_symlink(
* Initialize the bmap freelist prior to calling either
* bmapi or the directory create code.
*/
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
/*
* Allocate an inode for the symlink.
@@ -302,19 +301,11 @@ xfs_symlink(
* If the symlink will fit into the inode, write it inline.
*/
if (pathlen <= XFS_IFORK_DSIZE(ip)) {
- xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
- memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
- ip->i_d.di_size = pathlen;
-
- /*
- * The inode was initially created in extent format.
- */
- ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
- ip->i_df.if_flags |= XFS_IFINLINE;
+ xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+ ip->i_d.di_size = pathlen;
ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
} else {
int offset;
@@ -323,7 +314,7 @@ xfs_symlink(
error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
XFS_BMAPI_METADATA, &first_block, resblks,
- mval, &nmaps, &free_list);
+ mval, &nmaps, &dfops);
if (error)
goto out_bmap_cancel;
@@ -371,7 +362,7 @@ xfs_symlink(
* Create the directory entry for the symlink.
*/
error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
- &first_block, &free_list, resblks);
+ &first_block, &dfops, resblks);
if (error)
goto out_bmap_cancel;
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -386,7 +377,7 @@ xfs_symlink(
xfs_trans_set_sync(tp);
}
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_bmap_cancel;
@@ -402,7 +393,7 @@ xfs_symlink(
return 0;
out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
out_trans_cancel:
xfs_trans_cancel(tp);
out_release_inode:
@@ -436,7 +427,7 @@ xfs_inactive_symlink_rmt(
int done;
int error;
xfs_fsblock_t first_block;
- xfs_bmap_free_t free_list;
+ struct xfs_defer_ops dfops;
int i;
xfs_mount_t *mp;
xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
@@ -455,12 +446,9 @@ xfs_inactive_symlink_rmt(
*/
ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
- if (error) {
- xfs_trans_cancel(tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
return error;
- }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -478,7 +466,7 @@ xfs_inactive_symlink_rmt(
* Find the block(s) so we can inval and unmap them.
*/
done = 0;
- xfs_bmap_init(&free_list, &first_block);
+ xfs_defer_init(&dfops, &first_block);
nmaps = ARRAY_SIZE(mval);
error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
mval, &nmaps, 0);
@@ -498,17 +486,17 @@ xfs_inactive_symlink_rmt(
xfs_trans_binval(tp, bp);
}
/*
- * Unmap the dead block(s) to the free_list.
+ * Unmap the dead block(s) to the dfops.
*/
error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps,
- &first_block, &free_list, &done);
+ &first_block, &dfops, &done);
if (error)
goto error_bmap_cancel;
ASSERT(done);
/*
* Commit the first transaction. This logs the EFI and the inode.
*/
- error = xfs_bmap_finish(&tp, &free_list, ip);
+ error = xfs_defer_finish(&tp, &dfops, ip);
if (error)
goto error_bmap_cancel;
/*
@@ -538,7 +526,7 @@ xfs_inactive_symlink_rmt(
return 0;
error_bmap_cancel:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
error_trans_cancel:
xfs_trans_cancel(tp);
error_unlock:
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 6ced4f1434948..79cfd3fc53240 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -17,10 +17,11 @@
*/
#include "xfs.h"
-#include "xfs_sysfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_sysfs.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_stats.h"
@@ -362,3 +363,294 @@ struct kobj_type xfs_log_ktype = {
.sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_log_attrs,
};
+
+/*
+ * Metadata IO error configuration
+ *
+ * The sysfs structure here is:
+ * ...xfs/<dev>/error/<class>/<errno>/<error_attrs>
+ *
+ * where <class> allows us to discriminate between data IO and metadata IO,
+ * and any other future type of IO (e.g. special inode or directory error
+ * handling) we care to support.
+ */
+static inline struct xfs_error_cfg *
+to_error_cfg(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ return container_of(kobj, struct xfs_error_cfg, kobj);
+}
+
+static inline struct xfs_mount *
+err_to_mp(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ return container_of(kobj, struct xfs_mount, m_error_kobj);
+}
+
+static ssize_t
+max_retries_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+}
+
+static ssize_t
+max_retries_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < -1)
+ return -EINVAL;
+
+ cfg->max_retries = val;
+ return count;
+}
+XFS_SYSFS_ATTR_RW(max_retries);
+
+static ssize_t
+retry_timeout_seconds_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%ld\n",
+ jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+}
+
+static ssize_t
+retry_timeout_seconds_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ /* 1 day timeout maximum */
+ if (val < 0 || val > 86400)
+ return -EINVAL;
+
+ cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+ return count;
+}
+XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
+
+static ssize_t
+fail_at_unmount_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_mount *mp = err_to_mp(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+}
+
+static ssize_t
+fail_at_unmount_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_mount *mp = err_to_mp(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ mp->m_fail_unmount = val;
+ return count;
+}
+XFS_SYSFS_ATTR_RW(fail_at_unmount);
+
+static struct attribute *xfs_error_attrs[] = {
+ ATTR_LIST(max_retries),
+ ATTR_LIST(retry_timeout_seconds),
+ NULL,
+};
+
+
+struct kobj_type xfs_error_cfg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_error_attrs,
+};
+
+struct kobj_type xfs_error_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+};
+
+/*
+ * Error initialization tables. These need to be ordered in the same
+ * order as the enums used to index the array. All class init tables need to
+ * define a "default" behaviour as the first entry, all other entries can be
+ * empty.
+ */
+struct xfs_error_init {
+ char *name;
+ int max_retries;
+ int retry_timeout; /* in seconds */
+};
+
+static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
+ { .name = "default",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = 0,
+ },
+ { .name = "EIO",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = 0,
+ },
+ { .name = "ENOSPC",
+ .max_retries = XFS_ERR_RETRY_FOREVER,
+ .retry_timeout = 0,
+ },
+ { .name = "ENODEV",
+ .max_retries = 0,
+ },
+};
+
+static int
+xfs_error_sysfs_init_class(
+ struct xfs_mount *mp,
+ int class,
+ const char *parent_name,
+ struct xfs_kobj *parent_kobj,
+ const struct xfs_error_init init[])
+{
+ struct xfs_error_cfg *cfg;
+ int error;
+ int i;
+
+ ASSERT(class < XFS_ERR_CLASS_MAX);
+
+ error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
+ &mp->m_error_kobj, parent_name);
+ if (error)
+ return error;
+
+ for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
+ cfg = &mp->m_error_cfg[class][i];
+ error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
+ parent_kobj, init[i].name);
+ if (error)
+ goto out_error;
+
+ cfg->max_retries = init[i].max_retries;
+ cfg->retry_timeout = msecs_to_jiffies(
+ init[i].retry_timeout * MSEC_PER_SEC);
+ }
+ return 0;
+
+out_error:
+ /* unwind the entries that succeeded */
+ for (i--; i >= 0; i--) {
+ cfg = &mp->m_error_cfg[class][i];
+ xfs_sysfs_del(&cfg->kobj);
+ }
+ xfs_sysfs_del(parent_kobj);
+ return error;
+}
+
+int
+xfs_error_sysfs_init(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ /* .../xfs/<dev>/error/ */
+ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
+ &mp->m_kobj, "error");
+ if (error)
+ return error;
+
+ error = sysfs_create_file(&mp->m_error_kobj.kobject,
+ ATTR_LIST(fail_at_unmount));
+
+ if (error)
+ goto out_error;
+
+ /* .../xfs/<dev>/error/metadata/ */
+ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
+ "metadata", &mp->m_error_meta_kobj,
+ xfs_error_meta_init);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ xfs_sysfs_del(&mp->m_error_kobj);
+ return error;
+}
+
+void
+xfs_error_sysfs_del(
+ struct xfs_mount *mp)
+{
+ struct xfs_error_cfg *cfg;
+ int i, j;
+
+ for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
+ for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
+ cfg = &mp->m_error_cfg[i][j];
+
+ xfs_sysfs_del(&cfg->kobj);
+ }
+ }
+ xfs_sysfs_del(&mp->m_error_meta_kobj);
+ xfs_sysfs_del(&mp->m_error_kobj);
+}
+
+struct xfs_error_cfg *
+xfs_error_get_cfg(
+ struct xfs_mount *mp,
+ int error_class,
+ int error)
+{
+ struct xfs_error_cfg *cfg;
+
+ if (error < 0)
+ error = -error;
+
+ switch (error) {
+ case EIO:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
+ break;
+ case ENOSPC:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC];
+ break;
+ case ENODEV:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV];
+ break;
+ default:
+ cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
+ break;
+ }
+
+ return cfg;
+}
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index be692e59938db..d04637181ef21 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -58,4 +58,7 @@ xfs_sysfs_del(
wait_for_completion(&kobj->complete);
}
+int xfs_error_sysfs_init(struct xfs_mount *mp);
+void xfs_error_sysfs_del(struct xfs_mount *mp);
+
#endif /* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 13a029806805f..7f17ae6d709a1 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -22,7 +22,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_da_btree.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8d58426008ed..551b7e26980c5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -38,6 +38,7 @@ struct xlog_recover_item;
struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
+struct xfs_btree_cur;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -354,6 +355,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
DEFINE_BUF_EVENT(xfs_buf_bawrite);
DEFINE_BUF_EVENT(xfs_buf_lock);
DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
DEFINE_BUF_EVENT(xfs_buf_trylock);
DEFINE_BUF_EVENT(xfs_buf_unlock);
DEFINE_BUF_EVENT(xfs_buf_iowait);
@@ -364,7 +366,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_bdstrat_shut);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
@@ -944,7 +945,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
TP_ARGS(log, tic),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(unsigned, trans_type)
__field(char, ocnt)
__field(char, cnt)
__field(int, curr_res)
@@ -962,7 +962,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
),
TP_fast_assign(
__entry->dev = log->l_mp->m_super->s_dev;
- __entry->trans_type = tic->t_trans_type;
__entry->ocnt = tic->t_ocnt;
__entry->cnt = tic->t_cnt;
__entry->curr_res = tic->t_curr_res;
@@ -980,14 +979,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_block = log->l_curr_block;
__entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
),
- TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+ TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
"t_unit_res %u t_flags %s reserveq %s "
"writeq %s grant_reserve_cycle %d "
"grant_reserve_bytes %d grant_write_cycle %d "
"grant_write_bytes %d curr_cycle %d curr_block %d "
"tail_cycle %d tail_block %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
__entry->ocnt,
__entry->cnt,
__entry->curr_res,
@@ -1053,19 +1051,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
)
TRACE_EVENT(xfs_log_force,
- TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
- TP_ARGS(mp, lsn),
+ TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip),
+ TP_ARGS(mp, lsn, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_lsn_t, lsn)
+ __field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->lsn = lsn;
+ __entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d lsn 0x%llx",
+ TP_printk("dev %d:%d lsn 0x%llx caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->lsn)
+ __entry->lsn, (void *)__entry->caller_ip)
)
#define DEFINE_LOG_ITEM_EVENT(name) \
@@ -1136,15 +1136,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
)
DECLARE_EVENT_CLASS(xfs_file_class,
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
- TP_ARGS(ip, count, offset, flags),
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
+ TP_ARGS(ip, count, offset),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, flags)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1152,25 +1151,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
- "offset 0x%llx count 0x%zx ioflags %s",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
- __entry->count,
- __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+ __entry->count)
)
#define DEFINE_RW_EVENT(name) \
DEFINE_EVENT(xfs_file_class, name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
- TP_ARGS(ip, count, offset, flags))
-DEFINE_RW_EVENT(xfs_file_read);
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
+ TP_ARGS(ip, count, offset))
+DEFINE_RW_EVENT(xfs_file_buffered_read);
+DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
DECLARE_EVENT_CLASS(xfs_page_class,
@@ -1297,6 +1296,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -2184,6 +2186,379 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
+/*
+ * btree cursor events
+ *
+ * Each record captures the device, the cursor's bc_btnum, the traced level
+ * alongside bc_nlevels, the cursor's bc_ptrs[] entry for that level, and
+ * bp->b_bn (or -1 when no buffer is passed).  DEFINE_BTREE_CUR_EVENT()
+ * below instantiates a named tracepoint with this class's prototype.
+ */
+DECLARE_EVENT_CLASS(xfs_btree_cur_class,
+ TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
+ TP_ARGS(cur, level, bp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(int, nlevels)
+ __field(int, ptr)
+ __field(xfs_daddr_t, daddr)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->daddr = bp ? bp->b_bn : -1;
+ ),
+ TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->btnum,
+ __entry->level,
+ __entry->nlevels,
+ __entry->ptr,
+ (unsigned long long)__entry->daddr)
+)
+
+#define DEFINE_BTREE_CUR_EVENT(name) \
+DEFINE_EVENT(xfs_btree_cur_class, name, \
+ TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \
+ TP_ARGS(cur, level, bp))
+DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
+DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
+
+/* deferred ops */
+struct xfs_defer_pending;
+struct xfs_defer_intake;
+struct xfs_defer_ops;
+
+/*
+ * Deferred-ops event class: records the device (0 when @mp is NULL), the
+ * xfs_defer_ops pointer, and its dop_committed and dop_low values.
+ *
+ * The format string deliberately has no trailing newline: the trace event
+ * macros terminate every record themselves, so an embedded "\n" would put
+ * an empty line after each event in the trace output.
+ */
+DECLARE_EVENT_CLASS(xfs_defer_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop),
+ TP_ARGS(mp, dop),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(void *, dop)
+ __field(bool, committed)
+ __field(bool, low)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->dop = dop;
+ __entry->committed = dop->dop_committed;
+ __entry->low = dop->dop_low;
+ ),
+ TP_printk("dev %d:%d ops %p committed %d low %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dop,
+ __entry->committed,
+ __entry->low)
+)
+/* Instantiate a named tracepoint that uses xfs_defer_class. */
+#define DEFINE_DEFER_EVENT(name) \
+DEFINE_EVENT(xfs_defer_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \
+ TP_ARGS(mp, dop))
+
+/*
+ * Deferred-ops error event class: same fields as the plain deferred-ops
+ * events (device, xfs_defer_ops pointer, dop_committed, dop_low) plus the
+ * error code passed by the caller.
+ *
+ * The format string deliberately has no trailing newline: the trace event
+ * macros terminate every record themselves, so an embedded "\n" would put
+ * an empty line after each event in the trace output.
+ */
+DECLARE_EVENT_CLASS(xfs_defer_error_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error),
+ TP_ARGS(mp, dop, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(void *, dop)
+ __field(bool, committed)
+ __field(bool, low)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->dop = dop;
+ __entry->committed = dop->dop_committed;
+ __entry->low = dop->dop_low;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d ops %p committed %d low %d err %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dop,
+ __entry->committed,
+ __entry->low,
+ __entry->error)
+)
+/* Instantiate a named tracepoint that uses xfs_defer_error_class. */
+#define DEFINE_DEFER_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_defer_error_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), \
+ TP_ARGS(mp, dop, error))
+
+/*
+ * Pending deferred-work event class: records the device (0 when @mp is
+ * NULL), the op type taken from dfp->dfp_type->type, the intent pointer,
+ * dfp_committed and dfp_count.
+ *
+ * The format string deliberately has no trailing newline: the trace event
+ * macros terminate every record themselves, so an embedded "\n" would put
+ * an empty line after each event in the trace output.
+ */
+DECLARE_EVENT_CLASS(xfs_defer_pending_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp),
+ TP_ARGS(mp, dfp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(void *, intent)
+ __field(bool, committed)
+ __field(int, nr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->type = dfp->dfp_type->type;
+ __entry->intent = dfp->dfp_intent;
+ __entry->committed = dfp->dfp_committed;
+ __entry->nr = dfp->dfp_count;
+ ),
+ TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->intent,
+ __entry->committed,
+ __entry->nr)
+)
+/* Instantiate a named tracepoint that uses xfs_defer_pending_class. */
+#define DEFINE_DEFER_PENDING_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
+ TP_ARGS(mp, dfp))
+
+/*
+ * Deferred physical-extent event class: records the device, AG number, an
+ * integer op type, the AG block number and the extent length.  Note that
+ * the printed order is op, agno, agbno, len, which differs from the
+ * argument order.
+ */
+DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int type, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, type, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, type)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->type = type;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d op %d agno %u agbno %u len %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+/* Instantiate a named tracepoint that uses xfs_phys_extent_deferred_class. */
+#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int type, \
+ xfs_agblock_t bno, \
+ xfs_extlen_t len), \
+ TP_ARGS(mp, agno, type, bno, len))
+
+/*
+ * Deferred mapped-extent event class: records the device, AG number and AG
+ * block, the owning inode number, which fork is involved, the file offset,
+ * length and extent state, and an integer op code.  The fork is printed as
+ * "attr" when @whichfork equals XFS_ATTR_FORK and as "data" otherwise.
+ */
+DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int op,
+ xfs_agblock_t agbno,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset,
+ xfs_filblks_t len,
+ xfs_exntst_t state),
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->ino = ino;
+ __entry->agbno = agbno;
+ __entry->whichfork = whichfork;
+ __entry->l_loff = offset;
+ __entry->l_len = len;
+ __entry->l_state = state;
+ __entry->op = op;
+ ),
+ TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->op,
+ __entry->agno,
+ __entry->agbno,
+ __entry->ino,
+ __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data",
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+/* Instantiate a named tracepoint that uses xfs_map_extent_deferred_class. */
+#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_map_extent_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int op, \
+ xfs_agblock_t agbno, \
+ xfs_ino_t ino, \
+ int whichfork, \
+ xfs_fileoff_t offset, \
+ xfs_filblks_t len, \
+ xfs_exntst_t state), \
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
+
+/* Tracepoints taking (mp, dop). */
+DEFINE_DEFER_EVENT(xfs_defer_init);
+DEFINE_DEFER_EVENT(xfs_defer_cancel);
+DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
+DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
+DEFINE_DEFER_EVENT(xfs_defer_finish);
+DEFINE_DEFER_EVENT(xfs_defer_finish_done);
+
+/* Tracepoints taking (mp, dop, error). */
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error);
+
+/* Tracepoints taking (mp, dfp). */
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+
+/* bmap-free events are an alias for the physical-extent deferred events. */
+#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
+
+/*
+ * rmap tracepoints
+ *
+ * Records the device, AG number, AG block and length of the extent, plus
+ * the owner, offset and flags copied from @oinfo.  When @unwritten is true
+ * XFS_RMAP_UNWRITTEN is OR'd into the recorded flags.
+ * DEFINE_RMAP_EVENT() below instantiates a named tracepoint of this class.
+ */
+DECLARE_EVENT_CLASS(xfs_rmap_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
+ struct xfs_owner_info *oinfo),
+ TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned long, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = oinfo->oi_owner;
+ __entry->offset = oinfo->oi_offset;
+ __entry->flags = oinfo->oi_flags;
+ if (unwritten)
+ __entry->flags |= XFS_RMAP_UNWRITTEN;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#define DEFINE_RMAP_EVENT(name) \
+DEFINE_EVENT(xfs_rmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
+ struct xfs_owner_info *oinfo), \
+ TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
+
+/*
+ * simple AG-based error/%ip tracepoint class
+ *
+ * Records the device, AG number, error code and the caller's instruction
+ * pointer; the latter is printed symbolically via %ps.
+ * DEFINE_AG_ERROR_EVENT() below instantiates a named tracepoint of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(xfs_ag_error_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, error, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, error)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->error = error;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u error %d caller %ps",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->error,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_AG_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_ag_error_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, error, caller_ip))
+
+/*
+ * rmap unmap/map/convert tracepoints: each operation gets an entry event, a
+ * "_done" event and an AG error event; convert additionally has a "_state"
+ * event that reuses the AG error class.
+ */
+DEFINE_RMAP_EVENT(xfs_rmap_unmap);
+DEFINE_RMAP_EVENT(xfs_rmap_unmap_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error);
+DEFINE_RMAP_EVENT(xfs_rmap_map);
+DEFINE_RMAP_EVENT(xfs_rmap_map_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error);
+DEFINE_RMAP_EVENT(xfs_rmap_convert);
+DEFINE_RMAP_EVENT(xfs_rmap_convert_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state);
+
+/*
+ * rmap btree record event class: records the device, AG number, AG block,
+ * length, owner, offset and flags exactly as passed in by the caller.
+ */
+DECLARE_EVENT_CLASS(xfs_rmapbt_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ uint64_t owner, uint64_t offset, unsigned int flags),
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
+ __entry->offset = offset;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+/* Instantiate a named tracepoint that uses xfs_rmapbt_class. */
+#define DEFINE_RMAPBT_EVENT(name) \
+DEFINE_EVENT(xfs_rmapbt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ uint64_t owner, uint64_t offset, unsigned int flags), \
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+
+/* Deferred rmap events are an alias for the mapped-extent deferred events. */
+#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
+DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
+DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
+
+/* rmap btree block, record and lookup-result tracepoints. */
+DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
+DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);
+DEFINE_RMAPBT_EVENT(xfs_rmap_update);
+DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
+DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 20c53666cb4b3..5f3d33d16e670 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -47,47 +47,6 @@ xfs_trans_init(
}
/*
- * This routine is called to allocate a transaction structure.
- * The type parameter indicates the type of the transaction. These
- * are enumerated in xfs_trans.h.
- *
- * Dynamically allocate the transaction structure from the transaction
- * zone, initialize it, and return it to the caller.
- */
-xfs_trans_t *
-xfs_trans_alloc(
- xfs_mount_t *mp,
- uint type)
-{
- xfs_trans_t *tp;
-
- sb_start_intwrite(mp->m_super);
- tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
- tp->t_flags |= XFS_TRANS_FREEZE_PROT;
- return tp;
-}
-
-xfs_trans_t *
-_xfs_trans_alloc(
- xfs_mount_t *mp,
- uint type,
- xfs_km_flags_t memflags)
-{
- xfs_trans_t *tp;
-
- WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
- atomic_inc(&mp->m_active_trans);
-
- tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
- tp->t_magic = XFS_TRANS_HEADER_MAGIC;
- tp->t_type = type;
- tp->t_mountp = mp;
- INIT_LIST_HEAD(&tp->t_items);
- INIT_LIST_HEAD(&tp->t_busy);
- return tp;
-}
-
-/*
* Free the transaction structure. If there is more clean up
* to do when the structure is freed, add it here.
*/
@@ -99,7 +58,7 @@ xfs_trans_free(
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
atomic_dec(&tp->t_mountp->m_active_trans);
- if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+ if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
kmem_zone_free(xfs_trans_zone, tp);
@@ -125,7 +84,6 @@ xfs_trans_dup(
* Initialize the new transaction structure.
*/
ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
- ntp->t_type = tp->t_type;
ntp->t_mountp = tp->t_mountp;
INIT_LIST_HEAD(&ntp->t_items);
INIT_LIST_HEAD(&ntp->t_busy);
@@ -135,9 +93,9 @@ xfs_trans_dup(
ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
(tp->t_flags & XFS_TRANS_RESERVE) |
- (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+ (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
/* We gave our writer reference to the new transaction */
- tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
tp->t_blk_res = tp->t_blk_res_used;
@@ -165,7 +123,7 @@ xfs_trans_dup(
* This does not do quota reservations. That typically is done by the
* caller afterwards.
*/
-int
+static int
xfs_trans_reserve(
struct xfs_trans *tp,
struct xfs_trans_res *resp,
@@ -219,7 +177,7 @@ xfs_trans_reserve(
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
- permanent, tp->t_type);
+ permanent);
}
if (error)
@@ -268,6 +226,42 @@ undo_blocks:
return error;
}
+/*
+ * Allocate a transaction and make its reservation in one call.
+ *
+ * Unless XFS_TRANS_NO_WRITECOUNT is set in @flags, sb_start_intwrite() is
+ * called on the superblock first.  The structure is allocated zeroed from
+ * xfs_trans_zone, with KM_NOFS when XFS_TRANS_NOFS is set and KM_SLEEP
+ * otherwise, and @flags becomes the initial t_flags.  @resp, @blocks and
+ * @rtextents are handed to xfs_trans_reserve().
+ *
+ * Returns 0 with the new transaction stored in *tpp.  If the reservation
+ * fails the transaction is cancelled, the error is returned and *tpp is
+ * left untouched.
+ */
+int
+xfs_trans_alloc(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resp,
+ uint blocks,
+ uint rtextents,
+ uint flags,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+ sb_start_intwrite(mp->m_super);
+
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+ atomic_inc(&mp->m_active_trans);
+
+ tp = kmem_zone_zalloc(xfs_trans_zone,
+ (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
+ tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+ tp->t_flags = flags;
+ tp->t_mountp = mp;
+ INIT_LIST_HEAD(&tp->t_items);
+ INIT_LIST_HEAD(&tp->t_busy);
+
+ error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ *tpp = tp;
+ return 0;
+}
+
/*
* Record the indicated change to the given field for application
* to the file system's superblock when the transaction commits.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e7c49cf43fbc8..e2bf86aad33df 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -33,6 +33,9 @@ struct xfs_trans;
struct xfs_trans_res;
struct xfs_dquot_acct;
struct xfs_busy_extent;
+struct xfs_rud_log_item;
+struct xfs_rui_log_item;
+struct xfs_btree_cur;
typedef struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
@@ -52,6 +55,7 @@ typedef struct xfs_log_item {
/* delayed logging */
struct list_head li_cil; /* CIL pointers */
struct xfs_log_vec *li_lv; /* active log vector */
+ struct xfs_log_vec *li_lv_shadow; /* standby vector */
xfs_lsn_t li_seq; /* CIL commit seq */
} xfs_log_item_t;
@@ -90,7 +94,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
*/
typedef struct xfs_trans {
unsigned int t_magic; /* magic number */
- unsigned int t_type; /* transaction type */
unsigned int t_log_res; /* amt of log space resvd */
unsigned int t_log_count; /* count for perm log res */
unsigned int t_blk_res; /* # of blocks resvd */
@@ -148,10 +151,9 @@ typedef struct xfs_trans {
/*
* XFS transaction mechanism exported interfaces.
*/
-xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
- uint, uint);
+int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
+ uint blocks, uint rtextents, uint flags,
+ struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
@@ -211,17 +213,14 @@ void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
-struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
-void xfs_trans_log_efi_extent(xfs_trans_t *,
- struct xfs_efi_log_item *,
- xfs_fsblock_t,
- xfs_extlen_t);
-struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
+
+void xfs_extent_free_init_defer_op(void);
+struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *,
struct xfs_efi_log_item *,
uint);
int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
- xfs_extlen_t);
+ xfs_extlen_t, struct xfs_owner_info *);
int xfs_trans_commit(struct xfs_trans *);
int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
@@ -237,4 +236,16 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern kmem_zone_t *xfs_trans_zone;
extern kmem_zone_t *xfs_log_item_desc_zone;
+/*
+ * rmap updates
+ *
+ * Prototypes for registering the rmap-update deferred op type, obtaining an
+ * RUD log item for a given RUI, and finishing a single rmap update.
+ */
+enum xfs_rmap_intent_type;
+
+void xfs_rmap_update_init_defer_op(void);
+struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp,
+ struct xfs_rui_log_item *ruip);
+int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
+ struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type,
+ __uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock, xfs_filblks_t blockcount,
+ xfs_exntst_t state, struct xfs_btree_cur **pcur);
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index a96ae540eb629..459ddec137a48 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -21,66 +21,15 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
-
-/*
- * This routine is called to allocate an "extent free intention"
- * log item that will hold nextents worth of extents. The
- * caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-xfs_efi_log_item_t *
-xfs_trans_get_efi(xfs_trans_t *tp,
- uint nextents)
-{
- xfs_efi_log_item_t *efip;
-
- ASSERT(tp != NULL);
- ASSERT(nextents > 0);
-
- efip = xfs_efi_init(tp->t_mountp, nextents);
- ASSERT(efip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efip->efi_item);
- return efip;
-}
-
-/*
- * This routine is called to indicate that the described
- * extent is to be logged as needing to be freed. It should
- * be called once for each extent to be freed.
- */
-void
-xfs_trans_log_efi_extent(xfs_trans_t *tp,
- xfs_efi_log_item_t *efip,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len)
-{
- uint next_extent;
- xfs_extent_t *extp;
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
- ASSERT(next_extent < efip->efi_format.efi_nextents);
- extp = &(efip->efi_format.efi_extents[next_extent]);
- extp->ext_start = start_block;
- extp->ext_len = ext_len;
-}
-
+#include "xfs_bmap.h"
+#include "xfs_trace.h"
/*
* This routine is called to allocate an "extent free done"
@@ -88,12 +37,12 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
* caller must use all nextents extents, because we are not
* flexible about this at all.
*/
-xfs_efd_log_item_t *
-xfs_trans_get_efd(xfs_trans_t *tp,
- xfs_efi_log_item_t *efip,
- uint nextents)
+struct xfs_efd_log_item *
+xfs_trans_get_efd(struct xfs_trans *tp,
+ struct xfs_efi_log_item *efip,
+ uint nextents)
{
- xfs_efd_log_item_t *efdp;
+ struct xfs_efd_log_item *efdp;
ASSERT(tp != NULL);
ASSERT(nextents > 0);
@@ -118,13 +67,19 @@ xfs_trans_free_extent(
struct xfs_trans *tp,
struct xfs_efd_log_item *efdp,
xfs_fsblock_t start_block,
- xfs_extlen_t ext_len)
+ xfs_extlen_t ext_len,
+ struct xfs_owner_info *oinfo)
{
+ struct xfs_mount *mp = tp->t_mountp;
uint next_extent;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, start_block);
struct xfs_extent *extp;
int error;
- error = xfs_free_extent(tp, start_block, ext_len);
+ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+
+ error = xfs_free_extent(tp, start_block, ext_len, oinfo);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -145,3 +100,139 @@ xfs_trans_free_extent(
return error;
}
+
+/*
+ * Sort bmap items by AG.
+ *
+ * @priv is the xfs_mount; @a and @b are the xefi_list links of two
+ * xfs_extent_free_item structures.  Returns the AG number of @a's start
+ * block minus the AG number of @b's start block.
+ */
+static int
+xfs_extent_free_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_extent_free_item *ra;
+ struct xfs_extent_free_item *rb;
+
+ ra = container_of(a, struct xfs_extent_free_item, xefi_list);
+ rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+ return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+}
+
+/*
+ * Get an EFI.
+ *
+ * Passes @count to xfs_efi_init() to build the EFI log item, adds the item
+ * to @tp and returns it as an opaque pointer.
+ */
+STATIC void *
+xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_efi_log_item *efip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ ASSERT(efip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &efip->efi_item);
+ return efip;
+}
+
+/*
+ * Log a free extent to the intent item.
+ *
+ * @intent is the xfs_efi_log_item and @item is the xefi_list link of the
+ * xfs_extent_free_item to record.  Marks the transaction and the EFI's log
+ * item descriptor dirty, claims the next slot in efi_extents[] and copies
+ * the item's start block and block count into it.
+ */
+STATIC void
+xfs_extent_free_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_efi_log_item *efip = intent;
+ struct xfs_extent_free_item *free;
+ uint next_extent;
+ struct xfs_extent *extp;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+ ASSERT(next_extent < efip->efi_format.efi_nextents);
+ extp = &efip->efi_format.efi_extents[next_extent];
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+}
+
+/*
+ * Get an EFD so we can process all the free extents.
+ *
+ * Forwards to xfs_trans_get_efd() with @intent as the EFI and @count as
+ * the extent count, returning the result as an opaque pointer.
+ */
+STATIC void *
+xfs_extent_free_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_efd(tp, intent, count);
+}
+
+/*
+ * Process a free extent.
+ *
+ * Frees the extent described by @item through xfs_trans_free_extent(), with
+ * @done_item passed as the EFD, then frees the item itself regardless of
+ * the outcome.  @dop and @state are not used here.  Returns the result of
+ * xfs_trans_free_extent().
+ */
+STATIC int
+xfs_extent_free_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_extent_free_item *free;
+ int error;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ error = xfs_trans_free_extent(tp, done_item,
+ free->xefi_startblock,
+ free->xefi_blockcount,
+ &free->xefi_oinfo);
+ kmem_free(free);
+ return error;
+}
+
+/*
+ * Abort all pending EFIs.
+ *
+ * Calls xfs_efi_release() on @intent.
+ */
+STATIC void
+xfs_extent_free_abort_intent(
+ void *intent)
+{
+ xfs_efi_release(intent);
+}
+
+/*
+ * Cancel a free extent.
+ *
+ * Only frees the xfs_extent_free_item that contains @item.
+ */
+STATIC void
+xfs_extent_free_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_extent_free_item *free;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ kmem_free(free);
+}
+
+/*
+ * Callback table for the XFS_DEFER_OPS_TYPE_FREE deferred op, with
+ * max_items set to XFS_EFI_MAX_FAST_EXTENTS.
+ */
+static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .type = XFS_DEFER_OPS_TYPE_FREE,
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
+/*
+ * Register the deferred op type.
+ *
+ * Hands xfs_extent_free_defer_type to xfs_defer_init_op_type().
+ */
+void
+xfs_extent_free_init_defer_op(void)
+{
+ xfs_defer_init_op_type(&xfs_extent_free_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
new file mode 100644
index 0000000000000..5a50ef8815680
--- /dev/null
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_rmap_item.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+
+/*
+ * Set the map extent flags for this reverse mapping.
+ *
+ * me_flags is rebuilt from zero: the unwritten bit is set when @state is
+ * XFS_EXT_UNWRITTEN, the attr-fork bit when @whichfork is XFS_ATTR_FORK,
+ * and one XFS_RMAP_EXTENT_* operation flag is selected by @type.  An
+ * unrecognised @type hits ASSERT(0) and adds no operation flag.
+ */
+static void
+xfs_trans_set_rmap_flags(
+ struct xfs_map_extent *rmap,
+ enum xfs_rmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
+{
+ rmap->me_flags = 0;
+ if (state == XFS_EXT_UNWRITTEN)
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+ switch (type) {
+ case XFS_RMAP_MAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
+ break;
+ case XFS_RMAP_UNMAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+ break;
+ case XFS_RMAP_CONVERT:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+ break;
+ case XFS_RMAP_ALLOC:
+ rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+ break;
+ case XFS_RMAP_FREE:
+ rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+/*
+ * Build an RUD for @ruip with xfs_rud_init(), add its log item to @tp and
+ * return it.
+ */
+struct xfs_rud_log_item *
+xfs_trans_get_rud(
+ struct xfs_trans *tp,
+ struct xfs_rui_log_item *ruip)
+{
+ struct xfs_rud_log_item *rudp;
+
+ rudp = xfs_rud_init(tp->t_mountp, ruip);
+ xfs_trans_add_item(tp, &rudp->rud_item);
+ return rudp;
+}
+
+/*
+ * Finish an rmap update and log it to the RUD. Note that the transaction is
+ * marked dirty regardless of whether the rmap update succeeds or fails to
+ * support the RUI/RUD lifecycle rules.
+ *
+ * All mapping arguments and @pcur are passed straight through to
+ * xfs_rmap_finish_one(); its return value is returned to the caller.
+ */
+int
+xfs_trans_log_finish_rmap_update(
+ struct xfs_trans *tp,
+ struct xfs_rud_log_item *rudp,
+ enum xfs_rmap_intent_type type,
+ __uint64_t owner,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state,
+ struct xfs_btree_cur **pcur)
+{
+ int error;
+
+ error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
+ startblock, blockcount, state, pcur);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the RUI and frees the RUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ return error;
+}
+
+/*
+ * Sort rmap intents by AG.
+ *
+ * @priv is the xfs_mount; @a and @b are the ri_list links of two
+ * xfs_rmap_intent structures.  Returns the AG number of @a's start block
+ * minus the AG number of @b's start block.
+ */
+static int
+xfs_rmap_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_rmap_intent *ra;
+ struct xfs_rmap_intent *rb;
+
+ ra = container_of(a, struct xfs_rmap_intent, ri_list);
+ rb = container_of(b, struct xfs_rmap_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+}
+
+/*
+ * Get an RUI.
+ *
+ * Passes @count to xfs_rui_init() to build the RUI log item, adds the item
+ * to @tp and returns it as an opaque pointer.
+ */
+STATIC void *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_rui_log_item *ruip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ ASSERT(ruip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ return ruip;
+}
+
+/*
+ * Log rmap updates in the intent item.
+ *
+ * @intent is the xfs_rui_log_item and @item is the ri_list link of the
+ * xfs_rmap_intent to record.  Marks the transaction and the RUI's log item
+ * descriptor dirty, claims the next slot in rui_extents[], copies the
+ * owner, start block, start offset and length into it, and derives the
+ * slot's flags from the intent's type, fork and extent state.
+ */
+STATIC void
+xfs_rmap_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_rui_log_item *ruip = intent;
+ struct xfs_rmap_intent *rmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
+ ASSERT(next_extent < ruip->rui_format.rui_nextents);
+ map = &ruip->rui_format.rui_extents[next_extent];
+ map->me_owner = rmap->ri_owner;
+ map->me_startblock = rmap->ri_bmap.br_startblock;
+ map->me_startoff = rmap->ri_bmap.br_startoff;
+ map->me_len = rmap->ri_bmap.br_blockcount;
+ xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
+ rmap->ri_bmap.br_state);
+}
+
+/*
+ * Get an RUD so we can process all the deferred rmap updates.
+ *
+ * Forwards to xfs_trans_get_rud() with @intent as the RUI; @count is not
+ * used.
+ */
+STATIC void *
+xfs_rmap_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_rud(tp, intent);
+}
+
+/*
+ * Process a deferred rmap update.
+ *
+ * Hands the fields of the xfs_rmap_intent containing @item to
+ * xfs_trans_log_finish_rmap_update(), with @done_item as the RUD and
+ * @state reinterpreted as a btree cursor pointer slot, then frees the
+ * intent regardless of the outcome.  @dop is not used here.  Returns the
+ * result of xfs_trans_log_finish_rmap_update().
+ */
+STATIC int
+xfs_rmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_rmap_intent *rmap;
+ int error;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ error = xfs_trans_log_finish_rmap_update(tp, done_item,
+ rmap->ri_type,
+ rmap->ri_owner, rmap->ri_whichfork,
+ rmap->ri_bmap.br_startoff,
+ rmap->ri_bmap.br_startblock,
+ rmap->ri_bmap.br_blockcount,
+ rmap->ri_bmap.br_state,
+ (struct xfs_btree_cur **)state);
+ kmem_free(rmap);
+ return error;
+}
+
+/*
+ * Clean up after processing deferred rmaps.
+ *
+ * @state is treated as the struct xfs_btree_cur pointer and passed, along
+ * with @error, to xfs_rmap_finish_one_cleanup().
+ */
+STATIC void
+xfs_rmap_update_finish_cleanup(
+ struct xfs_trans *tp,
+ void *state,
+ int error)
+{
+ struct xfs_btree_cur *rcur = state;
+
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+}
+
+/*
+ * Abort all pending RUIs.
+ *
+ * Calls xfs_rui_release() on @intent.
+ */
+STATIC void
+xfs_rmap_update_abort_intent(
+ void *intent)
+{
+ xfs_rui_release(intent);
+}
+
+/*
+ * Cancel a deferred rmap update.
+ *
+ * Only frees the xfs_rmap_intent that contains @item.
+ */
+STATIC void
+xfs_rmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_rmap_intent *rmap;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ kmem_free(rmap);
+}
+
+/*
+ * Callback table for the XFS_DEFER_OPS_TYPE_RMAP deferred op, with
+ * max_items set to XFS_RUI_MAX_FAST_EXTENTS.  It also supplies a
+ * finish_cleanup hook.
+ */
+static const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .type = XFS_DEFER_OPS_TYPE_RMAP,
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_rmap_update_diff_items,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .log_item = xfs_rmap_update_log_item,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_update_finish_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
+};
+
+/*
+ * Register the deferred op type.
+ *
+ * Hands xfs_rmap_update_defer_type to xfs_defer_init_op_type().
+ */
+void
+xfs_rmap_update_init_defer_op(void)
+{
+ xfs_defer_init_op_type(&xfs_rmap_update_defer_type);
+}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 110f1d7d86b0b..ea62245fee263 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -32,11 +32,11 @@
static int
-xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, void *value, size_t size)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, void *value, size_t size)
{
int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ struct xfs_inode *ip = XFS_I(inode);
int error, asize = size;
/* Convert Linux syscall to XFS internal ATTR flags */
@@ -74,11 +74,12 @@ xfs_forget_acl(
}
static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
- const char *name, const void *value, size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
{
int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ struct xfs_inode *ip = XFS_I(inode);
int error;
/* Convert Linux syscall to XFS internal ATTR flags */
@@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
error = xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
if (!error)
- xfs_forget_acl(d_inode(dentry), name, xflags);
+ xfs_forget_acl(inode, name, xflags);
return error;
}
@@ -146,7 +147,7 @@ __xfs_xattr_put_listent(
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
context->count = -1; /* insufficient space */
- return 1;
+ return 0;
}
offset = (char *)context->alist + context->count;
strncpy(offset, prefix, prefix_len);
@@ -166,8 +167,7 @@ xfs_xattr_put_listent(
int flags,
unsigned char *name,
int namelen,
- int valuelen,
- unsigned char *value)
+ int valuelen)
{
char *prefix;
int prefix_len;
@@ -221,11 +221,15 @@ xfs_xattr_put_listent(
}
ssize_t
-xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+xfs_vn_listxattr(
+ struct dentry *dentry,
+ char *data,
+ size_t size)
{
struct xfs_attr_list_context context;
struct attrlist_cursor_kern cursor = { 0 };
- struct inode *inode = d_inode(dentry);
+ struct inode *inode = d_inode(dentry);
+ int error;
/*
* First read the regular on-disk attributes.
@@ -239,7 +243,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
context.firstu = context.bufsize;
context.put_listent = xfs_xattr_put_listent;
- xfs_attr_list_int(&context);
+ error = xfs_attr_list_int(&context);
+ if (error)
+ return error;
if (context.count < 0)
return -ERANGE;