summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2015-03-27 18:47:11 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2015-03-27 18:47:11 +1100
commitdc850a4456fb289472912ee1528b34c58f3c29cd (patch)
tree17325db755abd3557b8a147b95b683016db18ec0
parent8c352219eda33eb5adf17d016a4152785e092fd6 (diff)
parente38d2ff606af30055d62f248d8a8ce1c1114427e (diff)
downloadlinux-dc850a4456fb289472912ee1528b34c58f3c29cd.tar.gz
linux-dc850a4456fb289472912ee1528b34c58f3c29cd.tar.xz
Merge branch 'akpm-current/current'
Conflicts: Documentation/printk-formats.txt arch/arm64/Kconfig arch/s390/mm/mmap.c fs/fat/inode.c
-rw-r--r--.mailmap1
-rw-r--r--CREDITS13
-rw-r--r--Documentation/ABI/obsolete/sysfs-block-zram119
-rw-r--r--Documentation/ABI/testing/sysfs-block-zram25
-rw-r--r--Documentation/ABI/testing/sysfs-class-zram24
-rw-r--r--Documentation/CodingStyle13
-rw-r--r--Documentation/blockdev/zram.txt120
-rw-r--r--Documentation/cma/debugfs.txt21
-rw-r--r--Documentation/devicetree/bindings/rtc/digicolor-rtc.txt17
-rw-r--r--Documentation/devicetree/bindings/rtc/stmp3xxx-rtc.txt5
-rw-r--r--Documentation/filesystems/proc.txt15
-rw-r--r--Documentation/filesystems/vfat.txt10
-rw-r--r--Documentation/kernel-parameters.txt10
-rw-r--r--Documentation/printk-formats.txt59
-rw-r--r--Documentation/sysctl/kernel.txt62
-rw-r--r--Documentation/sysctl/vm.txt11
-rw-r--r--Documentation/vm/cleancache.txt4
-rw-r--r--Documentation/vm/hugetlbpage.txt31
-rw-r--r--Documentation/vm/unevictable-lru.txt26
-rw-r--r--Documentation/vm/zsmalloc.txt70
-rw-r--r--MAINTAINERS71
-rw-r--r--arch/Kconfig15
-rw-r--r--arch/alpha/Kconfig4
-rw-r--r--arch/alpha/include/uapi/asm/mman.h1
-rw-r--r--arch/arc/kernel/troubleshoot.c3
-rw-r--r--arch/arm/Kconfig7
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/elf.h4
-rw-r--r--arch/arm/include/asm/pgtable-3level.h1
-rw-r--r--arch/arm/include/asm/seccomp.h11
-rw-r--r--arch/arm/mm/init.c3
-rw-r--r--arch/arm/mm/mmap.c16
-rw-r--r--arch/arm/plat-pxa/dma.c111
-rw-r--r--arch/arm64/Kconfig16
-rw-r--r--arch/arm64/include/asm/elf.h5
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h4
-rw-r--r--arch/arm64/include/asm/page.h4
-rw-r--r--arch/arm64/include/asm/pgalloc.h8
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h6
-rw-r--r--arch/arm64/include/asm/pgtable-types.h12
-rw-r--r--arch/arm64/include/asm/pgtable.h10
-rw-r--r--arch/arm64/include/asm/tlb.h4
-rw-r--r--arch/arm64/mm/init.c2
-rw-r--r--arch/arm64/mm/mmap.c20
-rw-r--r--arch/arm64/mm/mmu.c4
-rw-r--r--arch/cris/arch-v10/kernel/fasttimer.c85
-rw-r--r--arch/cris/arch-v10/kernel/setup.c58
-rw-r--r--arch/cris/arch-v32/kernel/fasttimer.c85
-rw-r--r--arch/cris/arch-v32/kernel/setup.c62
-rw-r--r--arch/ia64/Kconfig18
-rw-r--r--arch/ia64/include/asm/page.h4
-rw-r--r--arch/ia64/include/asm/pgalloc.h4
-rw-r--r--arch/ia64/include/asm/pgtable.h12
-rw-r--r--arch/ia64/kernel/ivt.S12
-rw-r--r--arch/ia64/kernel/machine_kexec.c4
-rw-r--r--arch/m68k/Kconfig4
-rw-r--r--arch/microblaze/include/asm/seccomp.h8
-rw-r--r--arch/microblaze/kernel/cpu/mb.c149
-rw-r--r--arch/mips/Kconfig7
-rw-r--r--arch/mips/include/asm/elf.h4
-rw-r--r--arch/mips/include/asm/seccomp.h7
-rw-r--r--arch/mips/include/uapi/asm/mman.h1
-rw-r--r--arch/mips/mm/mmap.c24
-rw-r--r--arch/nios2/kernel/cpuinfo.c77
-rw-r--r--arch/openrisc/kernel/setup.c50
-rw-r--r--arch/parisc/Kconfig5
-rw-r--r--arch/parisc/include/asm/Kbuild1
-rw-r--r--arch/parisc/include/asm/pgalloc.h2
-rw-r--r--arch/parisc/include/asm/pgtable.h16
-rw-r--r--arch/parisc/include/asm/seccomp.h16
-rw-r--r--arch/parisc/include/uapi/asm/mman.h1
-rw-r--r--arch/parisc/kernel/entry.S4
-rw-r--r--arch/parisc/kernel/head.S4
-rw-r--r--arch/parisc/mm/init.c2
-rw-r--r--arch/powerpc/Kconfig8
-rw-r--r--arch/powerpc/include/asm/elf.h4
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc64.h2
-rw-r--r--arch/powerpc/include/asm/seccomp.h10
-rw-r--r--arch/powerpc/include/uapi/asm/Kbuild1
-rw-r--r--arch/powerpc/include/uapi/asm/seccomp.h16
-rw-r--r--arch/powerpc/mm/mmap.c28
-rw-r--r--arch/s390/Kconfig7
-rw-r--r--arch/s390/include/asm/elf.h12
-rw-r--r--arch/s390/mm/mmap.c32
-rw-r--r--arch/s390/pci/pci_debug.c6
-rw-r--r--arch/sh/Kconfig4
-rw-r--r--arch/sh/kernel/dwarf.c18
-rw-r--r--arch/sparc/Kconfig4
-rw-r--r--arch/sparc/include/asm/pgtable_64.h9
-rw-r--r--arch/sparc/include/asm/seccomp.h11
-rw-r--r--arch/sparc/kernel/mdesc.c22
-rw-r--r--arch/tile/Kconfig5
-rw-r--r--arch/um/Kconfig.um5
-rw-r--r--arch/x86/Kconfig20
-rw-r--r--arch/x86/include/asm/e820.h8
-rw-r--r--arch/x86/include/asm/elf.h3
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h8
-rw-r--r--arch/x86/include/asm/paravirt_types.h8
-rw-r--r--arch/x86/include/asm/pgalloc.h8
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h2
-rw-r--r--arch/x86/include/asm/pgtable.h13
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h1
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/seccomp.h21
-rw-r--r--arch/x86/include/asm/seccomp_32.h11
-rw-r--r--arch/x86/include/asm/seccomp_64.h17
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c12
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/signal.c22
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/ioremap.c23
-rw-r--r--arch/x86/mm/mmap.c38
-rw-r--r--arch/x86/mm/pgtable.c79
-rw-r--r--arch/x86/xen/mmu.c14
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h1
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/base/memory.c21
-rw-r--r--drivers/base/power/wakeup.c16
-rw-r--r--drivers/block/paride/pg.c4
-rw-r--r--drivers/block/zram/zram_drv.c854
-rw-r--r--drivers/block/zram/zram_drv.h7
-rw-r--r--drivers/clk/bcm/clk-kona.c28
-rw-r--r--drivers/clk/bcm/clk-kona.h1
-rw-r--r--drivers/cpuidle/governors/menu.c8
-rw-r--r--drivers/gpu/drm/i915/intel_drv.h3
-rw-r--r--drivers/gpu/drm/i915/intel_panel.c1
-rw-r--r--drivers/hwmon/ina2xx.c17
-rw-r--r--drivers/hwmon/lm85.c26
-rw-r--r--drivers/hwmon/w83795.c8
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c2
-rw-r--r--drivers/media/dvb-frontends/cxd2820r_c.c2
-rw-r--r--drivers/media/dvb-frontends/cxd2820r_core.c6
-rw-r--r--drivers/media/dvb-frontends/cxd2820r_priv.h2
-rw-r--r--drivers/media/dvb-frontends/cxd2820r_t.c2
-rw-r--r--drivers/media/dvb-frontends/cxd2820r_t2.c2
-rw-r--r--drivers/memstick/core/mspro_block.c3
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c15
-rw-r--r--drivers/parisc/ccio-dma.c54
-rw-r--r--drivers/parisc/sba_iommu.c86
-rw-r--r--drivers/rtc/Kconfig37
-rw-r--r--drivers/rtc/Makefile4
-rw-r--r--drivers/rtc/class.c24
-rw-r--r--drivers/rtc/hctosys.c2
-rw-r--r--drivers/rtc/interface.c13
-rw-r--r--drivers/rtc/rtc-ab-b5ze-s3.c2
-rw-r--r--drivers/rtc/rtc-abx805.c223
-rw-r--r--drivers/rtc/rtc-abx80x.c193
-rw-r--r--drivers/rtc/rtc-cmos.c36
-rw-r--r--drivers/rtc/rtc-da9052.c97
-rw-r--r--drivers/rtc/rtc-digicolor.c227
-rw-r--r--drivers/rtc/rtc-ds1305.c6
-rw-r--r--drivers/rtc/rtc-ds1685.c5
-rw-r--r--drivers/rtc/rtc-em3027.c11
-rw-r--r--drivers/rtc/rtc-hym8563.c2
-rw-r--r--drivers/rtc/rtc-mc13xxx.c2
-rw-r--r--drivers/rtc/rtc-mrst.c16
-rw-r--r--drivers/rtc/rtc-mt6397.c352
-rw-r--r--drivers/rtc/rtc-omap.c14
-rw-r--r--drivers/rtc/rtc-pcf8563.c7
-rw-r--r--drivers/rtc/rtc-s3c.c163
-rw-r--r--drivers/rtc/rtc-stmp3xxx.c66
-rw-r--r--drivers/rtc/rtc-tegra.c4
-rw-r--r--drivers/rtc/rtc-x1205.c4
-rw-r--r--drivers/s390/cio/blacklist.c12
-rw-r--r--drivers/s390/scsi/zfcp_erp.c4
-rw-r--r--drivers/scsi/bfa/bfad.c22
-rw-r--r--drivers/staging/lustre/lustre/Kconfig1
-rw-r--r--drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h4
-rw-r--r--drivers/xen/tmem.c16
-rw-r--r--firmware/ihex2fw.c1
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/adfs/dir_fplus.c1
-rw-r--r--fs/adfs/super.c20
-rw-r--r--fs/affs/affs.h28
-rw-r--r--fs/affs/amigaffs.c3
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/affs/inode.c32
-rw-r--r--fs/affs/namei.c6
-rw-r--r--fs/affs/super.c43
-rw-r--r--fs/binfmt_elf.c22
-rw-r--r--fs/binfmt_misc.c30
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/cifs/connect.c6
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/exec.c6
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/fat/cache.c81
-rw-r--r--fs/fat/dir.c6
-rw-r--r--fs/fat/fat.h11
-rw-r--r--fs/fat/fatent.c3
-rw-r--r--fs/fat/file.c65
-rw-r--r--fs/fat/inode.c88
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/fat/namei_msdos.c2
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/file.c3
-rw-r--r--fs/hfsplus/bfind.c4
-rw-r--r--fs/hfsplus/catalog.c3
-rw-r--r--fs/hfsplus/ioctl.c12
-rw-r--r--fs/hfsplus/xattr.c77
-rw-r--r--fs/hfsplus/xattr.h22
-rw-r--r--fs/hfsplus/xattr_security.c38
-rw-r--r--fs/hfsplus/xattr_trusted.c37
-rw-r--r--fs/hfsplus/xattr_user.c35
-rw-r--r--fs/hugetlbfs/inode.c92
-rw-r--r--fs/jbd2/journal.c11
-rw-r--r--fs/jbd2/transaction.c20
-rw-r--r--fs/locks.c38
-rw-r--r--fs/mpage.c23
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/write.c5
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nilfs2/alloc.c5
-rw-r--r--fs/nilfs2/bmap.c48
-rw-r--r--fs/nilfs2/bmap.h13
-rw-r--r--fs/nilfs2/btree.c66
-rw-r--r--fs/nilfs2/cpfile.c58
-rw-r--r--fs/nilfs2/direct.c17
-rw-r--r--fs/nilfs2/inode.c6
-rw-r--r--fs/nilfs2/mdt.c54
-rw-r--r--fs/nilfs2/mdt.h10
-rw-r--r--fs/nilfs2/page.c24
-rw-r--r--fs/nilfs2/segment.c17
-rw-r--r--fs/ocfs2/alloc.c64
-rw-r--r--fs/ocfs2/aops.c176
-rw-r--r--fs/ocfs2/buffer_head_io.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c91
-rw-r--r--fs/ocfs2/dir.c40
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c23
-rw-r--r--fs/ocfs2/dlm/dlmthread.c10
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/export.c2
-rw-r--r--fs/ocfs2/inode.c26
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/move_extents.c3
-rw-r--r--fs/ocfs2/namei.c15
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/refcounttree.c44
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/stack_user.c8
-rw-r--r--fs/ocfs2/suballoc.c27
-rw-r--r--fs/ocfs2/super.c70
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/ocfs2/xattr.c26
-rw-r--r--fs/posix_acl.c46
-rw-r--r--fs/proc/array.c24
-rw-r--r--fs/proc/base.c82
-rw-r--r--fs/proc/fd.c27
-rw-r--r--fs/super.c2
-rw-r--r--include/asm-generic/pgtable.h30
-rw-r--r--include/asm-generic/seccomp.h2
-rw-r--r--include/linux/a.out.h67
-rw-r--r--include/linux/bitmap.h8
-rw-r--r--include/linux/bitops.h4
-rw-r--r--include/linux/capability.h29
-rw-r--r--include/linux/cleancache.h13
-rw-r--r--include/linux/cma.h12
-rw-r--r--include/linux/compaction.h1
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/cred.h23
-rw-r--r--include/linux/elf-randomize.h22
-rw-r--r--include/linux/fs.h9
-rw-r--r--include/linux/gfp.h16
-rw-r--r--include/linux/huge_mm.h4
-rw-r--r--include/linux/hugetlb.h18
-rw-r--r--include/linux/io.h8
-rw-r--r--include/linux/ioport.h8
-rw-r--r--include/linux/kconfig.h15
-rw-r--r--include/linux/kernel.h12
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/ksm.h17
-rw-r--r--include/linux/memblock.h8
-rw-r--r--include/linux/memory_hotplug.h6
-rw-r--r--include/linux/mempool.h2
-rw-r--r--include/linux/migrate.h5
-rw-r--r--include/linux/mm.h126
-rw-r--r--include/linux/mm_types.h4
-rw-r--r--include/linux/mmzone.h8
-rw-r--r--include/linux/nmi.h21
-rw-r--r--include/linux/oom.h3
-rw-r--r--include/linux/page-flags.h319
-rw-r--r--include/linux/pagemap.h25
-rw-r--r--include/linux/poison.h4
-rw-r--r--include/linux/printk.h5
-rw-r--r--include/linux/rmap.h9
-rw-r--r--include/linux/rtc.h4
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/string_helpers.h8
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/types.h6
-rw-r--r--include/linux/uidgid.h12
-rw-r--r--include/linux/util_macros.h40
-rw-r--r--include/linux/vm_event_item.h1
-rw-r--r--include/linux/zsmalloc.h1
-rw-r--r--include/trace/events/xen.h2
-rw-r--r--include/uapi/asm-generic/mman-common.h1
-rw-r--r--init/Kconfig19
-rw-r--r--init/main.c2
-rw-r--r--ipc/msg.c34
-rw-r--r--ipc/sem.c26
-rw-r--r--ipc/shm.c42
-rw-r--r--ipc/util.c6
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/capability.c35
-rw-r--r--kernel/cgroup.c6
-rw-r--r--kernel/cpuset.c18
-rw-r--r--kernel/cred.c3
-rw-r--r--kernel/fork.c65
-rw-r--r--kernel/groups.c3
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/ptrace.c39
-rw-r--r--kernel/resource.c32
-rw-r--r--kernel/signal.c14
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sys_ni.c14
-rw-r--r--kernel/sysctl.c44
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/watchdog.c289
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Kconfig.debug12
-rw-r--r--lib/Makefile3
-rw-r--r--lib/cpumask.c9
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/dma-debug.c2
-rw-r--r--lib/find_bit.c193
-rw-r--r--lib/find_last_bit.c36
-rw-r--r--lib/find_next_bit.c285
-rw-r--r--lib/ioremap.c53
-rw-r--r--lib/lru_cache.c9
-rw-r--r--lib/string_helpers.c193
-rw-r--r--lib/test-string_helpers.c40
-rw-r--r--lib/vsprintf.c372
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/Makefile2
-rw-r--r--mm/cleancache.c276
-rw-r--r--mm/cma.c49
-rw-r--r--mm/cma.h24
-rw-r--r--mm/cma_debug.c171
-rw-r--r--mm/compaction.c30
-rw-r--r--mm/filemap.c30
-rw-r--r--mm/gup.c128
-rw-r--r--mm/huge_memory.c113
-rw-r--r--mm/hugetlb.c160
-rw-r--r--mm/internal.h8
-rw-r--r--mm/ksm.c12
-rw-r--r--mm/madvise.c176
-rw-r--r--mm/memblock.c4
-rw-r--r--mm/memcontrol.c241
-rw-r--r--mm/memory-failure.c109
-rw-r--r--mm/memory.c375
-rw-r--r--mm/memory_hotplug.c46
-rw-r--r--mm/mempolicy.c6
-rw-r--r--mm/mempool.c10
-rw-r--r--mm/memtest.c (renamed from arch/x86/mm/memtest.c)16
-rw-r--r--mm/migrate.c22
-rw-r--r--mm/mlock.c131
-rw-r--r--mm/mmap.c12
-rw-r--r--mm/mremap.c25
-rw-r--r--mm/oom_kill.c9
-rw-r--r--mm/page-writeback.c19
-rw-r--r--mm/page_alloc.c260
-rw-r--r--mm/page_isolation.c7
-rw-r--r--mm/rmap.c52
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.c22
-rw-r--r--mm/slub.c22
-rw-r--r--mm/swap.c45
-rw-r--r--mm/swap_state.c6
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c39
-rw-r--r--mm/util.c35
-rw-r--r--mm/vmalloc.c103
-rw-r--r--mm/vmscan.c117
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/zsmalloc.c976
-rw-r--r--mm/zswap.c4
-rw-r--r--net/openvswitch/flow.c4
-rw-r--r--net/sunrpc/Kconfig2
-rw-r--r--net/sunrpc/cache.c8
-rwxr-xr-xscripts/checkpatch.pl88
-rw-r--r--scripts/spelling.txt1
-rw-r--r--security/Kconfig1
388 files changed, 9401 insertions, 4927 deletions
diff --git a/.mailmap b/.mailmap
index 0d971cfb0772..6287004040e7 100644
--- a/.mailmap
+++ b/.mailmap
@@ -100,6 +100,7 @@ Rajesh Shah <rajesh.shah@intel.com>
Ralf Baechle <ralf@linux-mips.org>
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
Rémi Denis-Courmont <rdenis@simphalempin.com>
+Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Rudolf Marek <R.Marek@sh.cvut.cz>
Rui Saraiva <rmps@joel.ist.utl.pt>
Sachin P Sant <ssant@in.ibm.com>
diff --git a/CREDITS b/CREDITS
index 843e17647f3b..a0060bd054b5 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3008,6 +3008,19 @@ W: http://www.qsl.net/dl1bke/
D: Generic Z8530 driver, AX.25 DAMA slave implementation
D: Several AX.25 hacks
+N: Ricardo Ribalda Delgado
+E: ricardo.ribalda@gmail.com
+W: http://ribalda.com
+D: PLX USB338x driver
+D: PCA9634 driver
+D: Option GTM671WFS
+D: Fintek F81216A
+D: Various kernel hacks
+S: Qtechnology A/S
+S: Valby Langgade 142
+S: 2500 Valby
+S: Denmark
+
N: Francois-Rene Rideau
E: fare@tunes.org
W: http://www.tunes.org/~fare
diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram
new file mode 100644
index 000000000000..720ea92cfb2e
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-block-zram
@@ -0,0 +1,119 @@
+What: /sys/block/zram<id>/num_reads
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The num_reads file is read-only and specifies the number of
+ reads (failed or successful) done on this device.
+ Now accessible via zram<id>/stat node.
+
+What: /sys/block/zram<id>/num_writes
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The num_writes file is read-only and specifies the number of
+ writes (failed or successful) done on this device.
+ Now accessible via zram<id>/stat node.
+
+What: /sys/block/zram<id>/invalid_io
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The invalid_io file is read-only and specifies the number of
+ non-page-size-aligned I/O requests issued to this device.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/failed_reads
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The failed_reads file is read-only and specifies the number of
+ failed reads happened on this device.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/failed_writes
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The failed_writes file is read-only and specifies the number of
+ failed writes happened on this device.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/notify_free
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The notify_free file is read-only. Depending on device usage
+ scenario it may account a) the number of pages freed because
+ of swap slot free notifications or b) the number of pages freed
+ because of REQ_DISCARD requests sent by bio. The former ones
+ are sent to a swap block device when a swap slot is freed, which
+ implies that this disk is being used as a swap disk. The latter
+ ones are sent by filesystem mounted with discard option,
+ whenever some data blocks are getting discarded.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/zero_pages
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The zero_pages file is read-only and specifies number of zero
+ filled pages written to this disk. No memory is allocated for
+ such pages.
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/orig_data_size
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The orig_data_size file is read-only and specifies uncompressed
+ size of data stored in this disk. This excludes zero-filled
+ pages (zero_pages) since no memory is allocated for them.
+ Unit: bytes
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/compr_data_size
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The compr_data_size file is read-only and specifies compressed
+ size of data stored in this disk. So, compression ratio can be
+ calculated using orig_data_size and this statistic.
+ Unit: bytes
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/mem_used_total
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mem_used_total file is read-only and specifies the amount
+ of memory, including allocator fragmentation and metadata
+ overhead, allocated for this disk. So, allocator space
+ efficiency can be calculated using compr_data_size and this
+ statistic.
+ Unit: bytes
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/mem_used_max
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mem_used_max file is read/write and specifies the amount
+ of maximum memory zram have consumed to store compressed data.
+ For resetting the value, you should write "0". Otherwise,
+ you could see -EINVAL.
+ Unit: bytes
+ Downgraded to write-only node: so it's possible to set new
+ value only; its current value is stored in zram<id>/mm_stat
+ node.
+
+What: /sys/block/zram<id>/mem_limit
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mem_limit file is read/write and specifies the maximum
+ amount of memory ZRAM can use to store the compressed data.
+ The limit could be changed in run time and "0" means disable
+ the limit. No limit is the initial state. Unit: bytes
+ Downgraded to write-only node: so it's possible to set new
+ value only; its current value is stored in zram<id>/mm_stat
+ node.
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index a6148eaf91e5..2e69e83bf510 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -141,3 +141,28 @@ Description:
amount of memory ZRAM can use to store the compressed data. The
limit could be changed in run time and "0" means disable the
limit. No limit is the initial state. Unit: bytes
+
+What: /sys/block/zram<id>/compact
+Date: August 2015
+Contact: Minchan Kim <minchan@kernel.org>
+Description:
+ The compact file is write-only and trigger compaction for
+ allocator zrm uses. The allocator moves some objects so that
+ it could free fragment space.
+
+What: /sys/block/zram<id>/io_stat
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The io_stat file is read-only and accumulates device's I/O
+ statistics not accounted by block layer. For example,
+ failed_reads, failed_writes, etc. File format is similar to
+ block layer statistics file format.
+
+What: /sys/block/zram<id>/mm_stat
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mm_stat file is read-only and represents device's mm
+ statistics (orig_data_size, compr_data_size, etc.) in a format
+ similar to block layer statistics file format.
diff --git a/Documentation/ABI/testing/sysfs-class-zram b/Documentation/ABI/testing/sysfs-class-zram
new file mode 100644
index 000000000000..c39d2870f74f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-zram
@@ -0,0 +1,24 @@
+What: /sys/class/zram-control/
+Date: March 2015
+KernelVersion: 4.1
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The zram-control/ class sub-directory belongs to zram
+ device class
+
+What: /sys/class/zram-control/zram_add
+Date: March 2015
+KernelVersion: 4.1
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ RO attribute. Read operation will cause zram to add a new
+ device and return its device id back to user (so one can
+ use /dev/zram<id>), or error code.
+
+ What: /sys/class/zram-control/zram_add
+Date: March 2015
+KernelVersion: 4.1
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ Remove a specific /dev/zramX device, where X is a device_id
+ provided by user
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index e55accfca276..f4b78eafd92a 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -662,6 +662,19 @@ macros using parameters.
#define CONSTANT 0x4000
#define CONSTEXP (CONSTANT | 3)
+5) namespace collisions when defining local variables in macros resembling
+functions:
+
+#define FOO(x) \
+({ \
+ typeof(x) ret; \
+ ret = calc_ret(x); \
+ (ret); \
+)}
+
+ret is a common name for a local variable - __foo_ret is less likely
+to collide with an existing variable.
+
The cpp manual deals with macros exhaustively. The gcc internals manual also
covers RTL which is used frequently with assembly language in the kernel.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 7fcf9c6592ec..44b1a7797876 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -19,7 +19,9 @@ Following shows a typical sequence of steps for using zram.
1) Load Module:
modprobe zram num_devices=4
This creates 4 devices: /dev/zram{0,1,2,3}
- (num_devices parameter is optional. Default: 1)
+
+num_devices parameter is optional and tells zram how many devices should be
+pre-created. Default: 1.
2) Set max number of compression streams
Compression backend may use up to max_comp_streams compression streams,
@@ -97,27 +99,107 @@ size of the disk when not in use so a huge zram is wasteful.
mkfs.ext4 /dev/zram1
mount /dev/zram1 /tmp
-7) Stats:
- Per-device statistics are exported as various nodes under
- /sys/block/zram<id>/
- disksize
- num_reads
- num_writes
- failed_reads
- failed_writes
- invalid_io
- notify_free
- zero_pages
- orig_data_size
- compr_data_size
- mem_used_total
- mem_used_max
-
-8) Deactivate:
+7) Add/remove zram devices
+
+zram provides a control interface, which enables dynamic (on-demand) device
+addition and removal.
+
+In order to add a new /dev/zramX device, perform read operation on zram_add
+attribute. This will return either new device's device id (meaning that you
+can use /dev/zram<id>) or error code.
+
+Example:
+ cat /sys/class/zram-control/zram_add
+ 1
+
+To remove the existing /dev/zramX device (where X is a device id)
+execute
+ echo X > /sys/class/zram-control/zram_remove
+
+8) Stats:
+Per-device statistics are exported as various nodes under /sys/block/zram<id>/
+
+A brief description of exported device attritbutes. For more details please
+read Documentation/ABI/testing/sysfs-block-zram.
+
+Name access description
+---- ------ -----------
+disksize RW show and set the device's disk size
+initstate RO shows the initialization state of the device
+reset WO trigger device reset
+num_reads RO the number of reads
+failed_reads RO the number of failed reads
+num_write RO the number of writes
+failed_writes RO the number of failed writes
+invalid_io RO the number of non-page-size-aligned I/O requests
+max_comp_streams RW the number of possible concurrent compress operations
+comp_algorithm RW show and change the compression algorithm
+notify_free RO the number of notifications to free pages (either
+ slot free notifications or REQ_DISCARD requests)
+zero_pages RO the number of zero filled pages written to this disk
+orig_data_size RO uncompressed size of data stored in this disk
+compr_data_size RO compressed size of data stored in this disk
+mem_used_total RO the amount of memory allocated for this disk
+mem_used_max RW the maximum amount memory zram have consumed to
+ store compressed data
+mem_limit RW the maximum amount of memory ZRAM can use to store
+ the compressed data
+num_migrated RO the number of objects migrated migrated by compaction
+compact WO trigger memory compaction
+
+
+WARNING
+=======
+per-stat sysfs attributes are considered to be deprecated.
+The basic strategy is:
+-- the existing RW nodes will be downgraded to WO nodes (in linux 4.11)
+-- deprecated RO sysfs nodes will eventually be removed (in linux 4.11)
+
+The list of deprecated attributes can be found here:
+Documentation/ABI/obsolete/sysfs-block-zram
+
+Basically, every attribute that has its own read accessible sysfs node
+(e.g. num_reads) *AND* is accessible via one of the stat files (zram<id>/stat
+or zram<id>/io_stat or zram<id>/mm_stat) is considered to be deprecated.
+
+User space is advised to use the following files to read the device statistics.
+
+File /sys/block/zram<id>/stat
+
+Represents block layer statistics. Read Documentation/block/stat.txt for
+details.
+
+
+File /sys/block/zram<id>/io_stat
+
+The stat file represents device's I/O statistics not accounted by block
+layer and, thus, not available in zram<id>/stat file. It consists of a
+single line of text and contains the following stats separated by
+whitespace:
+ failed_reads
+ failed_writes
+ invalid_io
+ notify_free
+
+
+File /sys/block/zram<id>/mm_stat
+
+The stat file represents device's mm statistics. It consists of a single
+line of text and contains the following stats separated by whitespace:
+ orig_data_size
+ compr_data_size
+ mem_used_total
+ mem_limit
+ mem_used_max
+ zero_pages
+ num_migrated
+
+
+9) Deactivate:
swapoff /dev/zram0
umount /dev/zram1
-9) Reset:
+10) Reset:
Write any positive value to 'reset' sysfs node
echo 1 > /sys/block/zram0/reset
echo 1 > /sys/block/zram1/reset
diff --git a/Documentation/cma/debugfs.txt b/Documentation/cma/debugfs.txt
new file mode 100644
index 000000000000..6cef20a8cedc
--- /dev/null
+++ b/Documentation/cma/debugfs.txt
@@ -0,0 +1,21 @@
+The CMA debugfs interface is useful to retrieve basic information out of the
+different CMA areas and to test allocation/release in each of the areas.
+
+Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
+kernel's CMA index. So the first CMA zone would be:
+
+ <debugfs>/cma/cma-0
+
+The structure of the files created under that directory is as follows:
+
+ - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
+ - [RO] count: Amount of memory in the CMA area.
+ - [RO] order_per_bit: Order of pages represented by one bit.
+ - [RO] bitmap: The bitmap of page states in the zone.
+ - [WO] alloc: Allocate N pages from that CMA area. For example:
+
+ echo 5 > <debugfs>/cma/cma-2/alloc
+
+would try to allocate 5 pages from the cma-2 area.
+
+ - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/devicetree/bindings/rtc/digicolor-rtc.txt b/Documentation/devicetree/bindings/rtc/digicolor-rtc.txt
new file mode 100644
index 000000000000..d464986012cd
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/digicolor-rtc.txt
@@ -0,0 +1,17 @@
+Conexant Digicolor Real Time Clock controller
+
+This binding currently supports the CX92755 SoC.
+
+Required properties:
+- compatible: should be "cnxt,cx92755-rtc"
+- reg: physical base address of the controller and length of memory mapped
+ region.
+- interrupts: rtc alarm interrupt
+
+Example:
+
+ rtc@f0000c30 {
+ compatible = "cnxt,cx92755-rtc";
+ reg = <0xf0000c30 0x18>;
+ interrupts = <25>;
+ };
diff --git a/Documentation/devicetree/bindings/rtc/stmp3xxx-rtc.txt b/Documentation/devicetree/bindings/rtc/stmp3xxx-rtc.txt
index b800070fe6e9..fa6a94226669 100644
--- a/Documentation/devicetree/bindings/rtc/stmp3xxx-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/stmp3xxx-rtc.txt
@@ -7,6 +7,11 @@ Required properties:
region.
- interrupts: rtc alarm interrupt
+Optional properties:
+- stmp,crystal-freq: override crystal frequency as determined from fuse bits.
+ Only <32000> and <32768> are possible for the hardware. Use <0> for
+ "no crystal".
+
Example:
rtc@80056000 {
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a07ba61662ed..8e36c7e3c345 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -200,12 +200,12 @@ contains details information about the process itself. Its fields are
explained in Table 1-4.
(for SMP CONFIG users)
-For making accounting scalable, RSS related information are handled in
-asynchronous manner and the vaule may not be very precise. To see a precise
+For making accounting scalable, RSS related information are handled in an
+asynchronous manner and the value may not be very precise. To see a precise
snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
It's slow but very precise.
-Table 1-2: Contents of the status files (as of 2.6.30-rc7)
+Table 1-2: Contents of the status files (as of 3.20.0)
..............................................................................
Field Content
Name filename of the executable
@@ -213,6 +213,7 @@ Table 1-2: Contents of the status files (as of 2.6.30-rc7)
in an uninterruptible wait, Z is zombie,
T is traced or stopped)
Tgid thread group ID
+ Ngid NUMA group ID (0 if none)
Pid process id
PPid process id of the parent process
TracerPid PID of process tracing this process (0 if not)
@@ -220,6 +221,10 @@ Table 1-2: Contents of the status files (as of 2.6.30-rc7)
Gid Real, effective, saved set, and file system GIDs
FDSize number of file descriptor slots currently allocated
Groups supplementary group list
+ NStgid descendant namespace thread group ID hierarchy
+ NSpid descendant namespace process ID hierarchy
+ NSpgid descendant namespace process group ID hierarchy
+ NSsid descendant namespace session ID hierarchy
VmPeak peak virtual memory size
VmSize total program size
VmLck locked memory size
@@ -1704,6 +1709,10 @@ A typical output is
flags: 0100002
mnt_id: 19
+All locks associated with a file descriptor are shown in its fdinfo too.
+
+lock: 1: FLOCK ADVISORY WRITE 359 00:13:11691 0 EOF
+
The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags
pair provide additional information particular to the objects they represent.
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ce1126aceed8..223c32171dcc 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, User should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 29349b491ea9..427a603e822b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1989,7 +1989,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
seconds. Use this parameter to check at some
other rate. 0 disables periodic checking.
- memtest= [KNL,X86] Enable memtest
+ memtest= [KNL,X86,ARM] Enable memtest
Format: <integer>
default : 0 <disable>
Specifies the number of memtest passes to be
@@ -2236,8 +2236,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels
Format: [panic,][nopanic,][num]
- Valid num: 0
+ Valid num: 0 or 1
0 - turn nmi_watchdog off
+ 1 - turn nmi_watchdog on
When panic is specified, panic when an NMI watchdog
timeout occurs (or 'nopanic' to override the opposite
default).
@@ -2322,6 +2323,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
register save and restore. The kernel will only save
legacy floating-point registers on task switch.
+ nohugeiomap [KNL,x86] Disable kernel huge I/O mappings.
+
noxsave [BUGS=X86] Disables x86 extended register state save
and restore using xsave. The kernel will fallback to
enabling legacy floating-point and sse state.
@@ -2464,7 +2467,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nousb [USB] Disable the USB subsystem
- nowatchdog [KNL] Disable the lockup detector (NMI watchdog).
+ nowatchdog [KNL] Disable both lockup detectors, i.e.
+ soft-lockup and NMI watchdog (hard-lockup).
nowb [ARM]
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 87f10424bb28..8d4969048aeb 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -8,6 +8,21 @@ If variable is of Type, use printk format specifier:
unsigned long long %llu or %llx
size_t %zu or %zx
ssize_t %zd or %zx
+ s32 %d or %x
+ u32 %u or %x
+ s64 %lld or %llx
+ u64 %llu or %llx
+
+If <type> is dependent on a config option for its size (e.g., sector_t,
+blkcnt_t) or is architecture-dependent for its size (e.g., tcflag_t), use a
+format specifier of its largest possible type and explicitly cast to it.
+Example:
+
+ printk("test: sector number/total blocks: %llu/%llu\n",
+ (unsigned long long)sector, (unsigned long long)blockcount);
+
+Reminder: sizeof() result is of type size_t.
+
Raw pointer value SHOULD be printed with %p. The kernel supports
the following extended format specifiers for pointer types:
@@ -54,6 +69,7 @@ Struct Resources:
For printing struct resources. The 'R' and 'r' specifiers result in a
printed resource with ('R') or without ('r') a decoded flags member.
+ Passed by reference.
Physical addresses types phys_addr_t:
@@ -132,6 +148,8 @@ MAC/FDDI addresses:
specifier to use reversed byte order suitable for visual interpretation
of Bluetooth addresses which are in the little endian order.
+ Passed by reference.
+
IPv4 addresses:
%pI4 1.2.3.4
@@ -146,6 +164,8 @@ IPv4 addresses:
host, network, big or little endian order addresses respectively. Where
no specifier is provided the default network/big endian order is used.
+ Passed by reference.
+
IPv6 addresses:
%pI6 0001:0002:0003:0004:0005:0006:0007:0008
@@ -160,6 +180,8 @@ IPv6 addresses:
print a compressed IPv6 address as described by
http://tools.ietf.org/html/rfc5952
+ Passed by reference.
+
IPv4/IPv6 addresses (generic, with port, flowinfo, scope):
%pIS 1.2.3.4 or 0001:0002:0003:0004:0005:0006:0007:0008
@@ -186,6 +208,8 @@ IPv4/IPv6 addresses (generic, with port, flowinfo, scope):
specifiers can be used as well and are ignored in case of an IPv6
address.
+ Passed by reference.
+
Further examples:
%pISfc 1.2.3.4 or [1:2:3:4:5:6:7:8]/123456789
@@ -207,6 +231,8 @@ UUID/GUID addresses:
Where no additional specifiers are used the default big endian
order with lower case hex characters will be printed.
+ Passed by reference.
+
dentry names:
%pd{,2,3,4}
%pD{,2,3,4}
@@ -216,6 +242,16 @@ dentry names:
equivalent of %s dentry->d_name.name we used to use, %pd<n> prints
n last components. %pD does the same thing for struct file.
+ Passed by reference.
+
+task_struct comm name:
+
+ %pT
+
+ For printing task_struct->comm.
+
+ Passed by reference (NULL for "current").
+
struct va_format:
%pV
@@ -231,13 +267,19 @@ struct va_format:
Do not use this feature without some mechanism to verify the
correctness of the format string and va_list arguments.
-u64 SHOULD be printed with %llu/%llx:
+ Passed by reference.
- printk("%llu", u64_var);
+struct clk:
-s64 SHOULD be printed with %lld/%llx:
+ %pC pll1
+ %pCn pll1
+ %pCr 1560000000
- printk("%lld", s64_var);
+ For printing struct clk structures. '%pC' and '%pCn' print the name
+ (Common Clock Framework) or address (legacy clock framework) of the
+ structure; '%pCr' prints the current clock rate.
+
+ Passed by reference.
bitmap and its derivatives such as cpumask and nodemask:
@@ -248,15 +290,8 @@ bitmap and its derivatives such as cpumask and nodemask:
%*pb output the bitmap with field width as the number of bits and %*pbl
output the bitmap as range list with field width as the number of bits.
-If <type> is dependent on a config option for its size (e.g., sector_t,
-blkcnt_t) or is architecture-dependent for its size (e.g., tcflag_t), use a
-format specifier of its largest possible type and explicitly cast to it.
-Example:
+ Passed by reference.
- printk("test: sector number/total blocks: %llu/%llu\n",
- (unsigned long long)sector, (unsigned long long)blockcount);
-
-Reminder: sizeof() result is of type size_t.
Thank you for your cooperation and attention.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 83ab25660fc9..99d7eb3a1416 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -77,12 +77,14 @@ show up in /proc/sys/kernel:
- shmmax [ sysv ipc ]
- shmmni
- softlockup_all_cpu_backtrace
+- soft_watchdog
- stop-a [ SPARC only ]
- sysrq ==> Documentation/sysrq.txt
- sysctl_writes_strict
- tainted
- threads-max
- unknown_nmi_panic
+- watchdog
- watchdog_thresh
- version
@@ -417,16 +419,23 @@ successful IPC object allocation.
nmi_watchdog:
-Enables/Disables the NMI watchdog on x86 systems. When the value is
-non-zero the NMI watchdog is enabled and will continuously test all
-online cpus to determine whether or not they are still functioning
-properly. Currently, passing "nmi_watchdog=" parameter at boot time is
-required for this function to work.
+This parameter can be used to control the NMI watchdog
+(i.e. the hard lockup detector) on x86 systems.
-If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel
-parameter), the NMI watchdog shares registers with oprofile. By
-disabling the NMI watchdog, oprofile may have more registers to
-utilize.
+ 0 - disable the hard lockup detector
+ 1 - enable the hard lockup detector
+
+The hard lockup detector monitors each CPU for its ability to respond to
+timer interrupts. The mechanism utilizes CPU performance counter registers
+that are programmed to generate Non-Maskable Interrupts (NMIs) periodically
+while a CPU is busy. Hence, the alternative name 'NMI watchdog'.
+
+The NMI watchdog is disabled by default if the kernel is running as a guest
+in a KVM virtual machine. This default can be overridden by adding
+
+ nmi_watchdog=1
+
+to the guest kernel command line (see Documentation/kernel-parameters.txt).
==============================================================
@@ -816,6 +825,22 @@ NMI.
==============================================================
+soft_watchdog
+
+This parameter can be used to control the soft lockup detector.
+
+ 0 - disable the soft lockup detector
+ 1 - enable the soft lockup detector
+
+The soft lockup detector monitors CPUs for threads that are hogging the CPUs
+without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
+from running. The mechanism depends on the CPUs ability to respond to timer
+interrupts which are needed for the 'watchdog/N' threads to be woken up by
+the watchdog timer function, otherwise the NMI watchdog - if enabled - can
+detect a hard lockup condition.
+
+==============================================================
+
tainted:
Non-zero if the kernel has been tainted. Numeric values, which
@@ -858,6 +883,25 @@ example. If a system hangs up, try pressing the NMI switch.
==============================================================
+watchdog:
+
+This parameter can be used to disable or enable the soft lockup detector
+_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time.
+
+ 0 - disable both lockup detectors
+ 1 - enable both lockup detectors
+
+The soft lockup detector and the NMI watchdog can also be disabled or
+enabled individually, using the soft_watchdog and nmi_watchdog parameters.
+If the watchdog parameter is read, for example by executing
+
+ cat /proc/sys/kernel/watchdog
+
+the output of this command (0 or 1) shows the logical OR of soft_watchdog
+and nmi_watchdog.
+
+==============================================================
+
watchdog_thresh:
This value can be used to control the frequency of hrtimer and NMI
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 902b4574acfb..9832ec52f859 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -21,6 +21,7 @@ Currently, these files are in /proc/sys/vm:
- admin_reserve_kbytes
- block_dump
- compact_memory
+- compact_unevictable_allowed
- dirty_background_bytes
- dirty_background_ratio
- dirty_bytes
@@ -106,6 +107,16 @@ huge pages although processes will also directly compact memory as required.
==============================================================
+compact_unevictable_allowed
+
+Available only when CONFIG_COMPACTION is set. When set to 1, compaction is
+allowed to examine the unevictable lru (mlocked pages) for pages to compact.
+This should be used on systems where stalls for minor page faults are an
+acceptable trade for large contiguous free memory. Set to 0 to prevent
+compaction from moving pages that are unevictable. Default value is 1.
+
+==============================================================
+
dirty_background_bytes
Contains the amount of dirty memory at which the background kernel
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt
index 01d76282444e..e4b49df7a048 100644
--- a/Documentation/vm/cleancache.txt
+++ b/Documentation/vm/cleancache.txt
@@ -28,9 +28,7 @@ IMPLEMENTATION OVERVIEW
A cleancache "backend" that provides transcendent memory registers itself
to the kernel's cleancache "frontend" by calling cleancache_register_ops,
passing a pointer to a cleancache_ops structure with funcs set appropriately.
-Note that cleancache_register_ops returns the previous settings so that
-chaining can be performed if desired. The functions provided must conform to
-certain semantics as follows:
+The functions provided must conform to certain semantics as follows:
Most important, cleancache is "ephemeral". Pages which are copied into
cleancache have an indefinite lifetime which is completely unknowable
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index f2d3a100fe38..b32b9cd50c0e 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -267,21 +267,34 @@ call, then it is required that system administrator mount a file system of
type hugetlbfs:
mount -t hugetlbfs \
- -o uid=<value>,gid=<value>,mode=<value>,size=<value>,nr_inodes=<value> \
- none /mnt/huge
+ -o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\
+ min_size=<value>,nr_inodes=<value> none /mnt/huge
This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
/mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid
options sets the owner and group of the root of the file system. By default
the uid and gid of the current process are taken. The mode option sets the
mode of root of file system to value & 01777. This value is given in octal.
-By default the value 0755 is picked. The size option sets the maximum value of
-memory (huge pages) allowed for that filesystem (/mnt/huge). The size is
-rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of
-inodes that /mnt/huge can use. If the size or nr_inodes option is not
-provided on command line then no limits are set. For size and nr_inodes
-options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For
-example, size=2K has the same meaning as size=2048.
+By default the value 0755 is picked. If the paltform supports multiple huge
+page sizes, the pagesize option can be used to specify the huge page size and
+associated pool. pagesize is specified in bytes. If pagesize is not specified
+the paltform's default huge page size and associated pool will be used. The
+size option sets the maximum value of memory (huge pages) allowed for that
+filesystem (/mnt/huge). The size option can be specified in bytes, or as a
+percentage of the specified huge page pool (nr_hugepages). The size is
+rounded down to HPAGE_SIZE boundary. The min_size option sets the minimum
+value of memory (huge pages) allowed for the filesystem. min_size can be
+specified in the same way as size, either bytes or a percentage of the
+huge page pool. At mount time, the number of huge pages specified by
+min_size are reserved for use by the filesystem. If there are not enough
+free huge pages available, the mount will fail. As huge pages are allocated
+to the filesystem and freed, the reserve count is adjusted so that the sum
+of allocated and reserved huge pages is always at least min_size. The option
+nr_inodes sets the maximum number of inodes that /mnt/huge can use. If the
+size, min_size or nr_inodes option is not provided on command line then
+no limits are set. For pagesize, size, min_size and nr_inodes options, you
+can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For example, size=2K
+has the same meaning as size=2048.
While read system calls are supported on files that reside on hugetlb
file systems, write system calls are not.
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 744f82f86c58..86cb4624fc5a 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -317,7 +317,7 @@ If the VMA passes some filtering as described in "Filtering Special Vmas"
below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
off a subset of the VMA if the range does not cover the entire VMA. Once the
VMA has been merged or split or neither, mlock_fixup() will call
-__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to
+populate_vma_page_range() to fault in the pages via get_user_pages() and to
mark the pages as mlocked via mlock_vma_page().
Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
@@ -327,7 +327,7 @@ fault path or in vmscan.
Also note that a page returned by get_user_pages() could be truncated or
migrated out from under us, while we're trying to mlock it. To detect this,
-__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock.
+populate_vma_page_range() checks page_mapping() after acquiring the page lock.
If the page is still associated with its mapping, we'll go ahead and call
mlock_vma_page(). If the mapping is gone, we just unlock the page and move on.
In the worst case, this will result in a page mapped in a VM_LOCKED VMA
@@ -392,7 +392,7 @@ ignored for munlock.
If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
specified range. The range is then munlocked via the function
-__mlock_vma_pages_range() - the same function used to mlock a VMA range -
+populate_vma_page_range() - the same function used to mlock a VMA range -
passing a flag to indicate that munlock() is being performed.
Because the VMA access protections could have been changed to PROT_NONE after
@@ -402,7 +402,7 @@ get_user_pages() was enhanced to accept a flag to ignore the permissions when
fetching the pages - all of which should be resident as a result of previous
mlocking.
-For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
+For munlock(), populate_vma_page_range() unlocks individual pages by calling
munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
flag using TestClearPageMlocked(). As with mlock_vma_page(),
munlock_vma_page() use the Test*PageMlocked() function to handle the case where
@@ -463,21 +463,11 @@ populate the page table.
To mlock a range of memory under the unevictable/mlock infrastructure, the
mmap() handler and task address space expansion functions call
-mlock_vma_pages_range() specifying the vma and the address range to mlock.
-mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in
-"Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have
-already been set by the caller, in filtered VMAs. Thus these VMA's need not be
-visited for munlock when the region is unmapped.
-
-For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
-fault/allocate the pages and mlock them. Again, like mlock_fixup(),
-mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
-attempting to fault/allocate and mlock the pages and "upgrades" the semaphore
-back to write mode before returning.
-
-The callers of mlock_vma_pages_range() will have already added the memory range
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+The callers of populate_vma_page_range() will have already added the memory range
to be mlocked to the task's "locked_vm". To account for filtered VMAs,
-mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the
+populate_vma_page_range() returns the number of pages NOT mlocked. All of the
callers then subtract a non-negative return value from the task's locked_vm. A
negative return value represent an error - for example, from get_user_pages()
attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.txt
new file mode 100644
index 000000000000..64ed63c4f69d
--- /dev/null
+++ b/Documentation/vm/zsmalloc.txt
@@ -0,0 +1,70 @@
+zsmalloc
+--------
+
+This allocator is designed for use with zram. Thus, the allocator is
+supposed to work well under low memory conditions. In particular, it
+never attempts higher order page allocation which is very likely to
+fail under memory pressure. On the other hand, if we just use single
+(0-order) pages, it would suffer from very high fragmentation --
+any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+This was one of the major issues with its predecessor (xvmalloc).
+
+To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+and links them together using various 'struct page' fields. These linked
+pages act as a single higher-order page i.e. an object can span 0-order
+page boundaries. The code refers to these linked pages as a single entity
+called zspage.
+
+For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+since this satisfies the requirements of all its current users (in the
+worst case, page is incompressible and is thus stored "as-is" i.e. in
+uncompressed form). For allocation requests larger than this size, failure
+is returned (see zs_malloc).
+
+Additionally, zs_malloc() does not return a dereferenceable pointer.
+Instead, it returns an opaque handle (unsigned long) which encodes actual
+location of the allocated object. The reason for this indirection is that
+zsmalloc does not keep zspages permanently mapped since that would cause
+issues on 32-bit systems where the VA region for kernel space mappings
+is very small. So, before using the allocating memory, the object has to
+be mapped using zs_map_object() to get a usable pointer and subsequently
+unmapped using zs_unmap_object().
+
+stat
+----
+
+With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
+/sys/kernel/debug/zsmalloc/<user name>. Here is a sample of stat output:
+
+# cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
+ ..
+ ..
+ 9 176 0 1 186 129 8 4
+ 10 192 1 0 2880 2872 135 3
+ 11 208 0 1 819 795 42 2
+ 12 224 0 1 219 159 12 4
+ ..
+ ..
+
+
+class: index
+size: object size zspage stores
+almost_empty: the number of ZS_ALMOST_EMPTY zspages(see below)
+almost_full: the number of ZS_ALMOST_FULL zspages(see below)
+obj_allocated: the number of objects allocated
+obj_used: the number of objects allocated to the user
+pages_used: the number of pages allocated for the class
+pages_per_zspage: the number of 0-order pages to make a zspage
+
+We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
+ n <= N / f, where
+n = number of allocated objects
+N = total number of objects zspage can store
+f = fullness_threshold_frac(ie, 4 at the moment)
+
+Similarly, we assign zspage to:
+ ZS_ALMOST_FULL when n > N / f
+ ZS_EMPTY when n == 0
+ ZS_FULL when n == N
diff --git a/MAINTAINERS b/MAINTAINERS
index 1365e321d815..fdfdc24a6bac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -631,16 +631,16 @@ F: drivers/iommu/amd_iommu*.[ch]
F: include/linux/amd-iommu.h
AMD KFD
-M: Oded Gabbay <oded.gabbay@amd.com>
-L: dri-devel@lists.freedesktop.org
-T: git git://people.freedesktop.org/~gabbayo/linux.git
-S: Supported
-F: drivers/gpu/drm/amd/amdkfd/
+M: Oded Gabbay <oded.gabbay@amd.com>
+L: dri-devel@lists.freedesktop.org
+T: git git://people.freedesktop.org/~gabbayo/linux.git
+S: Supported
+F: drivers/gpu/drm/amd/amdkfd/
F: drivers/gpu/drm/amd/include/cik_structs.h
F: drivers/gpu/drm/amd/include/kgd_kfd_interface.h
-F: drivers/gpu/drm/radeon/radeon_kfd.c
-F: drivers/gpu/drm/radeon/radeon_kfd.h
-F: include/uapi/linux/kfd_ioctl.h
+F: drivers/gpu/drm/radeon/radeon_kfd.c
+F: drivers/gpu/drm/radeon/radeon_kfd.h
+F: include/uapi/linux/kfd_ioctl.h
AMD MICROCODE UPDATE SUPPORT
M: Andreas Herrmann <herrmann.der.user@googlemail.com>
@@ -1975,10 +1975,10 @@ F: Documentation/filesystems/befs.txt
F: fs/befs/
BECKHOFF CX5020 ETHERCAT MASTER DRIVER
-M: Dariusz Marcinkiewicz <reksio@newterm.pl>
-L: netdev@vger.kernel.org
-S: Maintained
-F: drivers/net/ethernet/ec_bhf.c
+M: Dariusz Marcinkiewicz <reksio@newterm.pl>
+L: netdev@vger.kernel.org
+S: Maintained
+F: drivers/net/ethernet/ec_bhf.c
BFS FILE SYSTEM
M: "Tigran A. Aivazian" <tigran@aivazian.fsnet.co.uk>
@@ -2904,11 +2904,11 @@ S: Supported
F: drivers/net/ethernet/chelsio/cxgb3/
CXGB3 ISCSI DRIVER (CXGB3I)
-M: Karen Xie <kxie@chelsio.com>
-L: linux-scsi@vger.kernel.org
-W: http://www.chelsio.com
-S: Supported
-F: drivers/scsi/cxgbi/cxgb3i
+M: Karen Xie <kxie@chelsio.com>
+L: linux-scsi@vger.kernel.org
+W: http://www.chelsio.com
+S: Supported
+F: drivers/scsi/cxgbi/cxgb3i
CXGB3 IWARP RNIC DRIVER (IW_CXGB3)
M: Steve Wise <swise@chelsio.com>
@@ -2925,11 +2925,11 @@ S: Supported
F: drivers/net/ethernet/chelsio/cxgb4/
CXGB4 ISCSI DRIVER (CXGB4I)
-M: Karen Xie <kxie@chelsio.com>
-L: linux-scsi@vger.kernel.org
-W: http://www.chelsio.com
-S: Supported
-F: drivers/scsi/cxgbi/cxgb4i
+M: Karen Xie <kxie@chelsio.com>
+L: linux-scsi@vger.kernel.org
+W: http://www.chelsio.com
+S: Supported
+F: drivers/scsi/cxgbi/cxgb4i
CXGB4 IWARP RNIC DRIVER (IW_CXGB4)
M: Steve Wise <swise@chelsio.com>
@@ -5233,7 +5233,7 @@ F: arch/x86/kernel/tboot.c
INTEL WIRELESS WIMAX CONNECTION 2400
M: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
M: linux-wimax@intel.com
-L: wimax@linuxwimax.org (subscribers-only)
+L: wimax@linuxwimax.org (subscribers-only)
S: Supported
W: http://linuxwimax.org
F: Documentation/wimax/README.i2400m
@@ -5947,7 +5947,7 @@ F: arch/powerpc/platforms/512x/
F: arch/powerpc/platforms/52xx/
LINUX FOR POWERPC EMBEDDED PPC4XX
-M: Alistair Popple <alistair@popple.id.au>
+M: Alistair Popple <alistair@popple.id.au>
M: Matt Porter <mporter@kernel.crashing.org>
W: http://www.penguinppc.org/
L: linuxppc-dev@lists.ozlabs.org
@@ -6419,7 +6419,7 @@ S: Supported
F: drivers/watchdog/mena21_wdt.c
MEN CHAMELEON BUS (mcb)
-M: Johannes Thumshirn <johannes.thumshirn@men.de>
+M: Johannes Thumshirn <johannes.thumshirn@men.de>
S: Supported
F: drivers/mcb/
F: include/linux/mcb.h
@@ -7979,10 +7979,10 @@ L: rtc-linux@googlegroups.com
S: Maintained
QAT DRIVER
-M: Tadeusz Struk <tadeusz.struk@intel.com>
-L: qat-linux@intel.com
-S: Supported
-F: drivers/crypto/qat/
+M: Tadeusz Struk <tadeusz.struk@intel.com>
+L: qat-linux@intel.com
+S: Supported
+F: drivers/crypto/qat/
QIB DRIVER
M: Mike Marciniszyn <infinipath@intel.com>
@@ -10166,11 +10166,11 @@ F: include/linux/cdrom.h
F: include/uapi/linux/cdrom.h
UNISYS S-PAR DRIVERS
-M: Benjamin Romer <benjamin.romer@unisys.com>
-M: David Kershner <david.kershner@unisys.com>
-L: sparmaintainer@unisys.com (Unisys internal)
-S: Supported
-F: drivers/staging/unisys/
+M: Benjamin Romer <benjamin.romer@unisys.com>
+M: David Kershner <david.kershner@unisys.com>
+L: sparmaintainer@unisys.com (Unisys internal)
+S: Supported
+F: drivers/staging/unisys/
UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER
M: Vinayak Holikatti <vinholikatti@gmail.com>
@@ -10721,7 +10721,7 @@ F: drivers/media/rc/winbond-cir.c
WIMAX STACK
M: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
M: linux-wimax@intel.com
-L: wimax@linuxwimax.org (subscribers-only)
+L: wimax@linuxwimax.org (subscribers-only)
S: Supported
W: http://linuxwimax.org
F: Documentation/wimax/README.wimax
@@ -11012,6 +11012,7 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/zsmalloc.c
F: include/linux/zsmalloc.h
+F: Documentation/vm/zsmalloc.txt
ZSWAP COMPRESSED SWAP CACHING
M: Seth Jennings <sjennings@variantweb.net>
diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a458d5..e1068987bad1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -446,6 +446,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
config HAVE_ARCH_TRANSPARENT_HUGEPAGE
bool
+config HAVE_ARCH_HUGE_VMAP
+ bool
+
config HAVE_ARCH_SOFT_DIRTY
bool
@@ -484,6 +487,18 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
This spares a stack switch and improves cache usage on softirq
processing.
+config PGTABLE_LEVELS
+ int
+ default 2
+
+config ARCH_HAS_ELF_RANDOMIZE
+ bool
+ help
+ An architecture supports choosing randomized locations for
+ stack, mmap, brk, and ET_DYN. Defined functions:
+ - arch_mmap_rnd()
+ - arch_randomize_brk()
+
#
# ABI hall of shame
#
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b7ff9a318c31..bf9e9d3b3792 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -76,6 +76,10 @@ config GENERIC_ISA_DMA
bool
default y
+config PGTABLE_LEVELS
+ int
+ default 3
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 0086b472bc2b..836fbd44f65b 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -44,6 +44,7 @@
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_SPACEAVAIL 5 /* ensure resources are available */
#define MADV_DONTNEED 6 /* don't need these pages */
+#define MADV_FREE 7 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 1badf9b84b51..e00a01879025 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -52,7 +52,7 @@ static void show_callee_regs(struct callee_regs *cregs)
print_reg_file(&(cregs->r13), 13);
}
-void print_task_path_n_nm(struct task_struct *tsk, char *buf)
+static void print_task_path_n_nm(struct task_struct *tsk, char *buf)
{
struct path path;
char *path_nm = NULL;
@@ -77,7 +77,6 @@ void print_task_path_n_nm(struct task_struct *tsk, char *buf)
done:
pr_info("Path: %s\n", path_nm);
}
-EXPORT_SYMBOL(print_task_path_n_nm);
static void show_faulting_vma(unsigned long address, char *buf)
{
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index eeec8ec906e8..27706a99cdf9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1,8 +1,8 @@
config ARM
bool
default y
- select ARCH_BINFMT_ELF_RANDOMIZE_PIE
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+ select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -286,6 +286,11 @@ config GENERIC_BUG
def_bool y
depends on BUG
+config PGTABLE_LEVELS
+ int
+ default 3 if ARM_LPAE
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index fe74c0d1e485..d7be5a9fd171 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += preempt.h
generic-y += resource.h
generic-y += rwsem.h
generic-y += scatterlist.h
+generic-y += seccomp.h
generic-y += sections.h
generic-y += segment.h
generic-y += sembuf.h
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index afb9cafd3786..c1ff8ab12914 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -125,10 +125,6 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
extern void elf_set_personality(const struct elf32_hdr *);
#define SET_PERSONALITY(ex) elf_set_personality(&(ex))
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
#ifdef CONFIG_MMU
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
struct linux_binprm;
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..6d6012a320b2 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/arm/include/asm/seccomp.h b/arch/arm/include/asm/seccomp.h
deleted file mode 100644
index 52b156b341f5..000000000000
--- a/arch/arm/include/asm/seccomp.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_ARM_SECCOMP_H
-#define _ASM_ARM_SECCOMP_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#endif /* _ASM_ARM_SECCOMP_H */
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 1609b022a72f..3d0e9aed4b40 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -335,6 +335,9 @@ void __init bootmem_init(void)
find_limits(&min, &max_low, &max_high);
+ early_memtest((phys_addr_t)min << PAGE_SHIFT,
+ (phys_addr_t)max_low << PAGE_SHIFT);
+
/*
* Sparsemem tries to allocate bootmem in memory_present(),
* so must be done after the fixed reservations
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 5e85ed371364..407dc786583a 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -169,14 +169,22 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+ /* 8 bits of randomness in 20 address space bits */
+ rnd = (unsigned long)get_random_int() % (1 << 8);
+
+ return rnd << PAGE_SHIFT;
+}
+
void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = 0UL;
- /* 8 bits of randomness in 20 address space bits */
- if ((current->flags & PF_RANDOMIZE) &&
- !(current->personality & ADDR_NO_RANDOMIZE))
- random_factor = (get_random_int() % (1 << 8)) << PAGE_SHIFT;
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
diff --git a/arch/arm/plat-pxa/dma.c b/arch/arm/plat-pxa/dma.c
index 054fc5a1a11c..d92f07f6ecfb 100644
--- a/arch/arm/plat-pxa/dma.c
+++ b/arch/arm/plat-pxa/dma.c
@@ -51,19 +51,19 @@ static struct dentry *dbgfs_root, *dbgfs_state, **dbgfs_chan;
static int dbg_show_requester_chan(struct seq_file *s, void *p)
{
- int pos = 0;
int chan = (int)s->private;
int i;
u32 drcmr;
- pos += seq_printf(s, "DMA channel %d requesters list :\n", chan);
+ seq_printf(s, "DMA channel %d requesters list :\n", chan);
for (i = 0; i < DMA_MAX_REQUESTERS; i++) {
drcmr = DRCMR(i);
if ((drcmr & DRCMR_CHLNUM) == chan)
- pos += seq_printf(s, "\tRequester %d (MAPVLD=%d)\n", i,
- !!(drcmr & DRCMR_MAPVLD));
+ seq_printf(s, "\tRequester %d (MAPVLD=%d)\n",
+ i, !!(drcmr & DRCMR_MAPVLD));
}
- return pos;
+
+ return 0;
}
static inline int dbg_burst_from_dcmd(u32 dcmd)
@@ -83,7 +83,6 @@ static int is_phys_valid(unsigned long addr)
static int dbg_show_descriptors(struct seq_file *s, void *p)
{
- int pos = 0;
int chan = (int)s->private;
int i, max_show = 20, burst, width;
u32 dcmd;
@@ -94,44 +93,43 @@ static int dbg_show_descriptors(struct seq_file *s, void *p)
spin_lock_irqsave(&dma_channels[chan].lock, flags);
phys_desc = DDADR(chan);
- pos += seq_printf(s, "DMA channel %d descriptors :\n", chan);
- pos += seq_printf(s, "[%03d] First descriptor unknown\n", 0);
+ seq_printf(s, "DMA channel %d descriptors :\n", chan);
+ seq_printf(s, "[%03d] First descriptor unknown\n", 0);
for (i = 1; i < max_show && is_phys_valid(phys_desc); i++) {
desc = phys_to_virt(phys_desc);
dcmd = desc->dcmd;
burst = dbg_burst_from_dcmd(dcmd);
width = (1 << ((dcmd >> 14) & 0x3)) >> 1;
- pos += seq_printf(s, "[%03d] Desc at %08lx(virt %p)\n",
- i, phys_desc, desc);
- pos += seq_printf(s, "\tDDADR = %08x\n", desc->ddadr);
- pos += seq_printf(s, "\tDSADR = %08x\n", desc->dsadr);
- pos += seq_printf(s, "\tDTADR = %08x\n", desc->dtadr);
- pos += seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d"
- " width=%d len=%d)\n",
- dcmd,
- DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR),
- DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG),
- DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN),
- DCMD_STR(ENDIAN), burst, width,
- dcmd & DCMD_LENGTH);
+ seq_printf(s, "[%03d] Desc at %08lx(virt %p)\n",
+ i, phys_desc, desc);
+ seq_printf(s, "\tDDADR = %08x\n", desc->ddadr);
+ seq_printf(s, "\tDSADR = %08x\n", desc->dsadr);
+ seq_printf(s, "\tDTADR = %08x\n", desc->dtadr);
+ seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n",
+ dcmd,
+ DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR),
+ DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG),
+ DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN),
+ DCMD_STR(ENDIAN), burst, width,
+ dcmd & DCMD_LENGTH);
phys_desc = desc->ddadr;
}
if (i == max_show)
- pos += seq_printf(s, "[%03d] Desc at %08lx ... max display reached\n",
- i, phys_desc);
+ seq_printf(s, "[%03d] Desc at %08lx ... max display reached\n",
+ i, phys_desc);
else
- pos += seq_printf(s, "[%03d] Desc at %08lx is %s\n",
- i, phys_desc, phys_desc == DDADR_STOP ?
- "DDADR_STOP" : "invalid");
+ seq_printf(s, "[%03d] Desc at %08lx is %s\n",
+ i, phys_desc, phys_desc == DDADR_STOP ?
+ "DDADR_STOP" : "invalid");
spin_unlock_irqrestore(&dma_channels[chan].lock, flags);
- return pos;
+
+ return 0;
}
static int dbg_show_chan_state(struct seq_file *s, void *p)
{
- int pos = 0;
int chan = (int)s->private;
u32 dcsr, dcmd;
int burst, width;
@@ -142,42 +140,39 @@ static int dbg_show_chan_state(struct seq_file *s, void *p)
burst = dbg_burst_from_dcmd(dcmd);
width = (1 << ((dcmd >> 14) & 0x3)) >> 1;
- pos += seq_printf(s, "DMA channel %d\n", chan);
- pos += seq_printf(s, "\tPriority : %s\n",
- str_prio[dma_channels[chan].prio]);
- pos += seq_printf(s, "\tUnaligned transfer bit: %s\n",
- DALGN & (1 << chan) ? "yes" : "no");
- pos += seq_printf(s, "\tDCSR = %08x (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
- dcsr, DCSR_STR(RUN), DCSR_STR(NODESC),
- DCSR_STR(STOPIRQEN), DCSR_STR(EORIRQEN),
- DCSR_STR(EORJMPEN), DCSR_STR(EORSTOPEN),
- DCSR_STR(SETCMPST), DCSR_STR(CLRCMPST),
- DCSR_STR(CMPST), DCSR_STR(EORINTR), DCSR_STR(REQPEND),
- DCSR_STR(STOPSTATE), DCSR_STR(ENDINTR),
- DCSR_STR(STARTINTR), DCSR_STR(BUSERR));
-
- pos += seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d"
- " len=%d)\n",
- dcmd,
- DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR),
- DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG),
- DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN),
- DCMD_STR(ENDIAN), burst, width, dcmd & DCMD_LENGTH);
- pos += seq_printf(s, "\tDSADR = %08x\n", DSADR(chan));
- pos += seq_printf(s, "\tDTADR = %08x\n", DTADR(chan));
- pos += seq_printf(s, "\tDDADR = %08x\n", DDADR(chan));
- return pos;
+ seq_printf(s, "DMA channel %d\n", chan);
+ seq_printf(s, "\tPriority : %s\n", str_prio[dma_channels[chan].prio]);
+ seq_printf(s, "\tUnaligned transfer bit: %s\n",
+ DALGN & (1 << chan) ? "yes" : "no");
+ seq_printf(s, "\tDCSR = %08x (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
+ dcsr, DCSR_STR(RUN), DCSR_STR(NODESC),
+ DCSR_STR(STOPIRQEN), DCSR_STR(EORIRQEN),
+ DCSR_STR(EORJMPEN), DCSR_STR(EORSTOPEN),
+ DCSR_STR(SETCMPST), DCSR_STR(CLRCMPST),
+ DCSR_STR(CMPST), DCSR_STR(EORINTR), DCSR_STR(REQPEND),
+ DCSR_STR(STOPSTATE), DCSR_STR(ENDINTR),
+ DCSR_STR(STARTINTR), DCSR_STR(BUSERR));
+
+ seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n",
+ dcmd,
+ DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR),
+ DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG),
+ DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN),
+ DCMD_STR(ENDIAN), burst, width, dcmd & DCMD_LENGTH);
+ seq_printf(s, "\tDSADR = %08x\n", DSADR(chan));
+ seq_printf(s, "\tDTADR = %08x\n", DTADR(chan));
+ seq_printf(s, "\tDDADR = %08x\n", DDADR(chan));
+
+ return 0;
}
static int dbg_show_state(struct seq_file *s, void *p)
{
- int pos = 0;
-
/* basic device status */
- pos += seq_printf(s, "DMA engine status\n");
- pos += seq_printf(s, "\tChannel number: %d\n", num_dma_channels);
+ seq_puts(s, "DMA engine status\n");
+ seq_printf(s, "\tChannel number: %d\n", num_dma_channels);
- return pos;
+ return 0;
}
#define DBGFS_FUNC_DECL(name) \
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 8b965e2769eb..0eae06cdac27 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2,8 +2,8 @@ config ARM64
def_bool y
select ACPI_GENERIC_GSI if ACPI
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
- select ARCH_BINFMT_ELF_RANDOMIZE_PIE
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+ select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
@@ -145,6 +145,13 @@ config KERNEL_MODE_NEON
config FIX_EARLYCON_MEM
def_bool y
+config PGTABLE_LEVELS
+ int
+ default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
+ default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
+ default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
+ default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -425,13 +432,6 @@ config ARM64_VA_BITS
default 42 if ARM64_VA_BITS_42
default 48 if ARM64_VA_BITS_48
-config ARM64_PGTABLE_LEVELS
- int
- default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
- default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
- default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
- default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
-
config CPU_BIG_ENDIAN
bool "Build big-endian kernel"
help
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 1f65be393139..faad6df49e5b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -125,7 +125,6 @@ typedef struct user_fpsimd_state elf_fpregset_t;
* the loader. We need to make sure that it is out of the way of the program
* that it will "exec", and that there is sufficient room for the brk.
*/
-extern unsigned long randomize_et_dyn(unsigned long base);
#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3)
/*
@@ -157,10 +156,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12))
#endif
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
#ifdef CONFIG_COMPAT
#ifdef __AARCH64EB__
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index dc8242d49d62..61505676d085 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -165,12 +165,12 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
/*
* If we are concatenating first level stage-2 page tables, we would have less
* than or equal to 16 pointers in the fake PGD, because that's what the
- * architecture allows. In this case, (4 - CONFIG_ARM64_PGTABLE_LEVELS)
+ * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS)
* represents the first level for the host, and we add 1 to go to the next
* level (which uses contatenation) for the stage-2 tables.
*/
#if PTRS_PER_S2_PGD <= 16
-#define KVM_PREALLOC_LEVEL (4 - CONFIG_ARM64_PGTABLE_LEVELS + 1)
+#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1)
#else
#define KVM_PREALLOC_LEVEL (0)
#endif
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 3d02b1869eb8..7d9c7e4a424b 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -38,9 +38,9 @@
* VA range, so 3 pages are reserved in all cases.
*/
#ifdef CONFIG_ARM64_64K_PAGES
-#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS)
+#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
#else
-#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS - 1)
+#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1)
#endif
#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index e20df38a8ff3..76420568d66a 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -28,7 +28,7 @@
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
@@ -46,9 +46,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
}
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
@@ -66,7 +66,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
}
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
extern pgd_t *pgd_alloc(struct mm_struct *mm);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 847e864202cc..59bfae75dc98 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -21,7 +21,7 @@
/*
* PMD_SHIFT determines the size a level 2 page table entry can map.
*/
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
#define PMD_SHIFT ((PAGE_SHIFT - 3) * 2 + 3)
#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
@@ -31,7 +31,7 @@
/*
* PUD_SHIFT determines the size a level 1 page table entry can map.
*/
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
#define PUD_SHIFT ((PAGE_SHIFT - 3) * 3 + 3)
#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_MASK (~(PUD_SIZE-1))
@@ -42,7 +42,7 @@
* PGDIR_SHIFT determines the size a top-level page table entry can map
* (depending on the configuration, this level can be 0, 1 or 2).
*/
-#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_ARM64_PGTABLE_LEVELS + 3)
+#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_PGTABLE_LEVELS + 3)
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
#define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT))
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index ca9df80af896..2b1bd7e52c3b 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -38,13 +38,13 @@ typedef struct { pteval_t pte; } pte_t;
#define pte_val(x) ((x).pte)
#define __pte(x) ((pte_t) { (x) } )
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
#define pmd_val(x) ((x).pmd)
#define __pmd(x) ((pmd_t) { (x) } )
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
#define pud_val(x) ((x).pud)
#define __pud(x) ((pud_t) { (x) } )
@@ -64,13 +64,13 @@ typedef pteval_t pte_t;
#define pte_val(x) (x)
#define __pte(x) (x)
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
typedef pmdval_t pmd_t;
#define pmd_val(x) (x)
#define __pmd(x) (x)
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
typedef pudval_t pud_t;
#define pud_val(x) (x)
#define __pud(x) (x)
@@ -86,9 +86,9 @@ typedef pteval_t pgprot_t;
#endif /* STRICT_MM_TYPECHECKS */
-#if CONFIG_ARM64_PGTABLE_LEVELS == 2
+#if CONFIG_PGTABLE_LEVELS == 2
#include <asm-generic/pgtable-nopmd.h>
-#elif CONFIG_ARM64_PGTABLE_LEVELS == 3
+#elif CONFIG_PGTABLE_LEVELS == 3
#include <asm-generic/pgtable-nopud.h>
#endif
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 800ec0e87ed9..bd5db28324ba 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -285,10 +285,12 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
@@ -374,7 +376,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
*/
#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot)
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
@@ -409,9 +411,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
#define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK))
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
@@ -445,7 +447,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK))
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 53d9c354219f..3a0242c7eb8d 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -53,7 +53,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
tlb_remove_entry(tlb, pte);
}
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
unsigned long addr)
{
@@ -62,7 +62,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
}
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
unsigned long addr)
{
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index ae85da6307bb..597831bdddf3 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -190,6 +190,8 @@ void __init bootmem_init(void)
min = PFN_UP(memblock_start_of_DRAM());
max = PFN_DOWN(memblock_end_of_DRAM());
+ early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
+
/*
* Sparsemem tries to allocate bootmem in memory_present(), so must be
* done after the fixed reservations.
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 54922d1275b8..ed177475dd8c 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -47,17 +47,16 @@ static int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
{
- unsigned long rnd = 0;
+ unsigned long rnd;
- if (current->flags & PF_RANDOMIZE)
- rnd = (long)get_random_int() & STACK_RND_MASK;
+ rnd = (unsigned long)get_random_int() & STACK_RND_MASK;
return rnd << PAGE_SHIFT;
}
-static unsigned long mmap_base(void)
+static unsigned long mmap_base(unsigned long rnd)
{
unsigned long gap = rlimit(RLIMIT_STACK);
@@ -66,7 +65,7 @@ static unsigned long mmap_base(void)
else if (gap > MAX_GAP)
gap = MAX_GAP;
- return PAGE_ALIGN(STACK_TOP - gap - mmap_rnd());
+ return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
/*
@@ -75,15 +74,20 @@ static unsigned long mmap_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
/*
* Fall back to the standard layout if the personality bit is set, or
* if the expected stack growth is unlimited:
*/
if (mmap_is_legacy()) {
- mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base();
+ mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 428aaf86c95b..5b8b664422d3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -555,10 +555,10 @@ void vmemmap_free(unsigned long start, unsigned long end)
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
#endif
diff --git a/arch/cris/arch-v10/kernel/fasttimer.c b/arch/cris/arch-v10/kernel/fasttimer.c
index 48a59afbeeb1..e9298739d72e 100644
--- a/arch/cris/arch-v10/kernel/fasttimer.c
+++ b/arch/cris/arch-v10/kernel/fasttimer.c
@@ -527,7 +527,8 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
i = debug_log_cnt;
while (i != end_i || debug_log_cnt_wrapped) {
- if (seq_printf(m, debug_log_string[i], debug_log_value[i]) < 0)
+ seq_printf(m, debug_log_string[i], debug_log_value[i]);
+ if (seq_has_overflowed(m))
return 0;
i = (i+1) % DEBUG_LOG_MAX;
}
@@ -542,24 +543,22 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
int cur = (fast_timers_started - i - 1) % NUM_TIMER_STATS;
#if 1 //ndef FAST_TIMER_LOG
- seq_printf(m, "div: %i freq: %i delay: %i"
- "\n",
+ seq_printf(m, "div: %i freq: %i delay: %i\n",
timer_div_settings[cur],
timer_freq_settings[cur],
timer_delay_settings[cur]);
#endif
#ifdef FAST_TIMER_LOG
t = &timer_started_log[cur];
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
#endif
}
@@ -571,16 +570,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
seq_printf(m, "Timers added: %i\n", fast_timers_added);
for (i = 0; i < num_to_show; i++) {
t = &timer_added_log[(fast_timers_added - i - 1) % NUM_TIMER_STATS];
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
}
seq_putc(m, '\n');
@@ -590,16 +588,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
seq_printf(m, "Timers expired: %i\n", fast_timers_expired);
for (i = 0; i < num_to_show; i++) {
t = &timer_expired_log[(fast_timers_expired - i - 1) % NUM_TIMER_STATS];
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
}
seq_putc(m, '\n');
@@ -611,19 +608,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
while (t) {
nextt = t->next;
local_irq_restore(flags);
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
-/* " func: 0x%08lX" */
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data
-/* , t->function */
- ) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
local_irq_save(flags);
if (t->next != nextt)
diff --git a/arch/cris/arch-v10/kernel/setup.c b/arch/cris/arch-v10/kernel/setup.c
index 4f96d71b5154..7ab31f1c7540 100644
--- a/arch/cris/arch-v10/kernel/setup.c
+++ b/arch/cris/arch-v10/kernel/setup.c
@@ -63,35 +63,37 @@ int show_cpuinfo(struct seq_file *m, void *v)
else
info = &cpu_info[revision];
- return seq_printf(m,
- "processor\t: 0\n"
- "cpu\t\t: CRIS\n"
- "cpu revision\t: %lu\n"
- "cpu model\t: %s\n"
- "cache size\t: %d kB\n"
- "fpu\t\t: %s\n"
- "mmu\t\t: %s\n"
- "mmu DMA bug\t: %s\n"
- "ethernet\t: %s Mbps\n"
- "token ring\t: %s\n"
- "scsi\t\t: %s\n"
- "ata\t\t: %s\n"
- "usb\t\t: %s\n"
- "bogomips\t: %lu.%02lu\n",
+ seq_printf(m,
+ "processor\t: 0\n"
+ "cpu\t\t: CRIS\n"
+ "cpu revision\t: %lu\n"
+ "cpu model\t: %s\n"
+ "cache size\t: %d kB\n"
+ "fpu\t\t: %s\n"
+ "mmu\t\t: %s\n"
+ "mmu DMA bug\t: %s\n"
+ "ethernet\t: %s Mbps\n"
+ "token ring\t: %s\n"
+ "scsi\t\t: %s\n"
+ "ata\t\t: %s\n"
+ "usb\t\t: %s\n"
+ "bogomips\t: %lu.%02lu\n",
- revision,
- info->model,
- info->cache,
- info->flags & HAS_FPU ? "yes" : "no",
- info->flags & HAS_MMU ? "yes" : "no",
- info->flags & HAS_MMU_BUG ? "yes" : "no",
- info->flags & HAS_ETHERNET100 ? "10/100" : "10",
- info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no",
- info->flags & HAS_SCSI ? "yes" : "no",
- info->flags & HAS_ATA ? "yes" : "no",
- info->flags & HAS_USB ? "yes" : "no",
- (loops_per_jiffy * HZ + 500) / 500000,
- ((loops_per_jiffy * HZ + 500) / 5000) % 100);
+ revision,
+ info->model,
+ info->cache,
+ info->flags & HAS_FPU ? "yes" : "no",
+ info->flags & HAS_MMU ? "yes" : "no",
+ info->flags & HAS_MMU_BUG ? "yes" : "no",
+ info->flags & HAS_ETHERNET100 ? "10/100" : "10",
+ info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no",
+ info->flags & HAS_SCSI ? "yes" : "no",
+ info->flags & HAS_ATA ? "yes" : "no",
+ info->flags & HAS_USB ? "yes" : "no",
+ (loops_per_jiffy * HZ + 500) / 500000,
+ ((loops_per_jiffy * HZ + 500) / 5000) % 100);
+
+ return 0;
}
#endif /* CONFIG_PROC_FS */
diff --git a/arch/cris/arch-v32/kernel/fasttimer.c b/arch/cris/arch-v32/kernel/fasttimer.c
index b130c2c5fdd8..5c84dbb99f30 100644
--- a/arch/cris/arch-v32/kernel/fasttimer.c
+++ b/arch/cris/arch-v32/kernel/fasttimer.c
@@ -501,7 +501,8 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
i = debug_log_cnt;
while ((i != end_i || debug_log_cnt_wrapped)) {
- if (seq_printf(m, debug_log_string[i], debug_log_value[i]) < 0)
+ seq_printf(m, debug_log_string[i], debug_log_value[i]);
+ if (seq_has_overflowed(m))
return 0;
i = (i+1) % DEBUG_LOG_MAX;
}
@@ -516,23 +517,21 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
int cur = (fast_timers_started - i - 1) % NUM_TIMER_STATS;
#if 1 //ndef FAST_TIMER_LOG
- seq_printf(m, "div: %i delay: %i"
- "\n",
+ seq_printf(m, "div: %i delay: %i\n",
timer_div_settings[cur],
timer_delay_settings[cur]);
#endif
#ifdef FAST_TIMER_LOG
t = &timer_started_log[cur];
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
#endif
}
@@ -544,16 +543,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
seq_printf(m, "Timers added: %i\n", fast_timers_added);
for (i = 0; i < num_to_show; i++) {
t = &timer_added_log[(fast_timers_added - i - 1) % NUM_TIMER_STATS];
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
}
seq_putc(m, '\n');
@@ -563,16 +561,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
seq_printf(m, "Timers expired: %i\n", fast_timers_expired);
for (i = 0; i < num_to_show; i++){
t = &timer_expired_log[(fast_timers_expired - i - 1) % NUM_TIMER_STATS];
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
}
seq_putc(m, '\n');
@@ -584,19 +581,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v)
while (t != NULL){
nextt = t->next;
local_irq_restore(flags);
- if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu "
- "d: %6li us data: 0x%08lX"
-/* " func: 0x%08lX" */
- "\n",
- t->name,
- (unsigned long)t->tv_set.tv_jiff,
- (unsigned long)t->tv_set.tv_usec,
- (unsigned long)t->tv_expires.tv_jiff,
- (unsigned long)t->tv_expires.tv_usec,
- t->delay_us,
- t->data
-/* , t->function */
- ) < 0)
+ seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n",
+ t->name,
+ (unsigned long)t->tv_set.tv_jiff,
+ (unsigned long)t->tv_set.tv_usec,
+ (unsigned long)t->tv_expires.tv_jiff,
+ (unsigned long)t->tv_expires.tv_usec,
+ t->delay_us,
+ t->data);
+ if (seq_has_overflowed(m))
return 0;
local_irq_save(flags);
if (t->next != nextt)
diff --git a/arch/cris/arch-v32/kernel/setup.c b/arch/cris/arch-v32/kernel/setup.c
index 231927bba7c2..cd1865d68b2e 100644
--- a/arch/cris/arch-v32/kernel/setup.c
+++ b/arch/cris/arch-v32/kernel/setup.c
@@ -72,36 +72,38 @@ int show_cpuinfo(struct seq_file *m, void *v)
}
}
- return seq_printf(m,
- "processor\t: %d\n"
- "cpu\t\t: CRIS\n"
- "cpu revision\t: %lu\n"
- "cpu model\t: %s\n"
- "cache size\t: %d KB\n"
- "fpu\t\t: %s\n"
- "mmu\t\t: %s\n"
- "mmu DMA bug\t: %s\n"
- "ethernet\t: %s Mbps\n"
- "token ring\t: %s\n"
- "scsi\t\t: %s\n"
- "ata\t\t: %s\n"
- "usb\t\t: %s\n"
- "bogomips\t: %lu.%02lu\n\n",
-
- cpu,
- revision,
- info->cpu_model,
- info->cache_size,
- info->flags & HAS_FPU ? "yes" : "no",
- info->flags & HAS_MMU ? "yes" : "no",
- info->flags & HAS_MMU_BUG ? "yes" : "no",
- info->flags & HAS_ETHERNET100 ? "10/100" : "10",
- info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no",
- info->flags & HAS_SCSI ? "yes" : "no",
- info->flags & HAS_ATA ? "yes" : "no",
- info->flags & HAS_USB ? "yes" : "no",
- (loops_per_jiffy * HZ + 500) / 500000,
- ((loops_per_jiffy * HZ + 500) / 5000) % 100);
+ seq_printf(m,
+ "processor\t: %d\n"
+ "cpu\t\t: CRIS\n"
+ "cpu revision\t: %lu\n"
+ "cpu model\t: %s\n"
+ "cache size\t: %d KB\n"
+ "fpu\t\t: %s\n"
+ "mmu\t\t: %s\n"
+ "mmu DMA bug\t: %s\n"
+ "ethernet\t: %s Mbps\n"
+ "token ring\t: %s\n"
+ "scsi\t\t: %s\n"
+ "ata\t\t: %s\n"
+ "usb\t\t: %s\n"
+ "bogomips\t: %lu.%02lu\n\n",
+
+ cpu,
+ revision,
+ info->cpu_model,
+ info->cache_size,
+ info->flags & HAS_FPU ? "yes" : "no",
+ info->flags & HAS_MMU ? "yes" : "no",
+ info->flags & HAS_MMU_BUG ? "yes" : "no",
+ info->flags & HAS_ETHERNET100 ? "10/100" : "10",
+ info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no",
+ info->flags & HAS_SCSI ? "yes" : "no",
+ info->flags & HAS_ATA ? "yes" : "no",
+ info->flags & HAS_USB ? "yes" : "no",
+ (loops_per_jiffy * HZ + 500) / 500000,
+ ((loops_per_jiffy * HZ + 500) / 5000) % 100);
+
+ return 0;
}
#endif /* CONFIG_PROC_FS */
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index cc3414fda362..76d25b2cfbbe 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -1,3 +1,8 @@
+config PGTABLE_LEVELS
+ int "Page Table Levels" if !IA64_PAGE_SIZE_64KB
+ range 3 4 if !IA64_PAGE_SIZE_64KB
+ default 3
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -287,19 +292,6 @@ config IA64_PAGE_SIZE_64KB
endchoice
-choice
- prompt "Page Table Levels"
- default PGTABLE_3
-
-config PGTABLE_3
- bool "3 Levels"
-
-config PGTABLE_4
- depends on !IA64_PAGE_SIZE_64KB
- bool "4 Levels"
-
-endchoice
-
if IA64_HP_SIM
config HZ
default 32
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index 1f1bf144fe62..ec48bb9f95e1 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -173,7 +173,7 @@ get_order (unsigned long size)
*/
typedef struct { unsigned long pte; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
typedef struct { unsigned long pud; } pud_t;
#endif
typedef struct { unsigned long pgd; } pgd_t;
@@ -182,7 +182,7 @@ get_order (unsigned long size)
# define pte_val(x) ((x).pte)
# define pmd_val(x) ((x).pmd)
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
# define pud_val(x) ((x).pud)
#endif
# define pgd_val(x) ((x).pgd)
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index 5767cdfc08db..f5e70e961948 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -32,7 +32,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
quicklist_free(0, NULL, pgd);
}
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
static inline void
pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
{
@@ -49,7 +49,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
quicklist_free(0, NULL, pud);
}
#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud)
-#endif /* CONFIG_PGTABLE_4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
static inline void
pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 7b6f8801df57..9f3ed9ee8f13 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -99,7 +99,7 @@
#define PMD_MASK (~(PMD_SIZE-1))
#define PTRS_PER_PMD (1UL << (PTRS_PER_PTD_SHIFT))
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
/*
* Definitions for second level:
*
@@ -117,7 +117,7 @@
*
* PGDIR_SHIFT determines what a first-level page table entry can map.
*/
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
#define PGDIR_SHIFT (PUD_SHIFT + (PTRS_PER_PTD_SHIFT))
#else
#define PGDIR_SHIFT (PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
@@ -180,7 +180,7 @@
#define __S111 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX)
#define pgd_ERROR(e) printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
#define pud_ERROR(e) printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
#endif
#define pmd_ERROR(e) printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
@@ -281,7 +281,7 @@ extern unsigned long VMALLOC_END;
#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & _PFN_MASK))
#define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET))
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
#define pgd_none(pgd) (!pgd_val(pgd))
#define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd)))
#define pgd_present(pgd) (pgd_val(pgd) != 0UL)
@@ -384,7 +384,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address)
here. */
#define pgd_offset_gate(mm, addr) pgd_offset_k(addr)
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
/* Find an entry in the second-level page table.. */
#define pud_offset(dir,addr) \
((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
@@ -586,7 +586,7 @@ extern struct page *zero_page_memmap_ptr;
#define __HAVE_ARCH_PGD_OFFSET_GATE
-#ifndef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 3
#include <asm-generic/pgtable-nopud.h>
#endif
#include <asm-generic/pgtable.h>
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index 18e794a57248..e42bf7a913f3 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -146,7 +146,7 @@ ENTRY(vhpt_miss)
(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
shr.u r28=r22,PUD_SHIFT // shift pud index into position
#else
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -155,7 +155,7 @@ ENTRY(vhpt_miss)
ld8 r17=[r17] // get *pgd (may be 0)
;;
(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
;;
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -222,13 +222,13 @@ ENTRY(vhpt_miss)
*/
ld8 r25=[r21] // read *pte again
ld8 r26=[r17] // read *pmd again
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
ld8 r19=[r28] // read *pud again
#endif
cmp.ne p6,p7=r0,r0
;;
cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
#endif
mov r27=PAGE_SHIFT<<2
@@ -476,7 +476,7 @@ ENTRY(nested_dtlb_miss)
(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
shr.u r18=r22,PUD_SHIFT // shift pud index into position
#else
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
@@ -487,7 +487,7 @@ ENTRY(nested_dtlb_miss)
(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr)
;;
-#ifdef CONFIG_PGTABLE_4
+#if CONFIG_PGTABLE_LEVELS == 4
(p7) ld8 r17=[r17] // get *pud (may be 0)
shr.u r18=r22,PMD_SHIFT // shift pmd index into position
;;
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 5151a649c96b..b72cd7a07222 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -156,9 +156,9 @@ void arch_crash_save_vmcoreinfo(void)
VMCOREINFO_OFFSET(node_memblk_s, start_paddr);
VMCOREINFO_OFFSET(node_memblk_s, size);
#endif
-#ifdef CONFIG_PGTABLE_3
+#if CONFIG_PGTABLE_LEVELS == 3
VMCOREINFO_CONFIG(PGTABLE_3);
-#elif defined(CONFIG_PGTABLE_4)
+#elif CONFIG_PGTABLE_LEVELS == 4
VMCOREINFO_CONFIG(PGTABLE_4);
#endif
}
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 87b7c7581b1d..2dd8f63bfbbb 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -67,6 +67,10 @@ config HZ
default 1000 if CLEOPATRA
default 100
+config PGTABLE_LEVELS
+ default 2 if SUN3 || COLDFIRE
+ default 3
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/microblaze/include/asm/seccomp.h b/arch/microblaze/include/asm/seccomp.h
index 0d912758a0d7..204618a2ce84 100644
--- a/arch/microblaze/include/asm/seccomp.h
+++ b/arch/microblaze/include/asm/seccomp.h
@@ -3,14 +3,8 @@
#include <linux/unistd.h>
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_sigreturn
-#define __NR_seccomp_read_32 __NR_read
-#define __NR_seccomp_write_32 __NR_write
-#define __NR_seccomp_exit_32 __NR_exit
-#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#include <asm-generic/seccomp.h>
#endif /* _ASM_MICROBLAZE_SECCOMP_H */
diff --git a/arch/microblaze/kernel/cpu/mb.c b/arch/microblaze/kernel/cpu/mb.c
index 7b5dca7ed39d..9581d194d9e4 100644
--- a/arch/microblaze/kernel/cpu/mb.c
+++ b/arch/microblaze/kernel/cpu/mb.c
@@ -27,7 +27,6 @@
static int show_cpuinfo(struct seq_file *m, void *v)
{
- int count = 0;
char *fpga_family = "Unknown";
char *cpu_ver = "Unknown";
int i;
@@ -48,91 +47,89 @@ static int show_cpuinfo(struct seq_file *m, void *v)
}
}
- count = seq_printf(m,
- "CPU-Family: MicroBlaze\n"
- "FPGA-Arch: %s\n"
- "CPU-Ver: %s, %s endian\n"
- "CPU-MHz: %d.%02d\n"
- "BogoMips: %lu.%02lu\n",
- fpga_family,
- cpu_ver,
- cpuinfo.endian ? "little" : "big",
- cpuinfo.cpu_clock_freq /
- 1000000,
- cpuinfo.cpu_clock_freq %
- 1000000,
- loops_per_jiffy / (500000 / HZ),
- (loops_per_jiffy / (5000 / HZ)) % 100);
-
- count += seq_printf(m,
- "HW:\n Shift:\t\t%s\n"
- " MSR:\t\t%s\n"
- " PCMP:\t\t%s\n"
- " DIV:\t\t%s\n",
- (cpuinfo.use_instr & PVR0_USE_BARREL_MASK) ? "yes" : "no",
- (cpuinfo.use_instr & PVR2_USE_MSR_INSTR) ? "yes" : "no",
- (cpuinfo.use_instr & PVR2_USE_PCMP_INSTR) ? "yes" : "no",
- (cpuinfo.use_instr & PVR0_USE_DIV_MASK) ? "yes" : "no");
-
- count += seq_printf(m,
- " MMU:\t\t%x\n",
- cpuinfo.mmu);
-
- count += seq_printf(m,
- " MUL:\t\t%s\n"
- " FPU:\t\t%s\n",
- (cpuinfo.use_mult & PVR2_USE_MUL64_MASK) ? "v2" :
- (cpuinfo.use_mult & PVR0_USE_HW_MUL_MASK) ? "v1" : "no",
- (cpuinfo.use_fpu & PVR2_USE_FPU2_MASK) ? "v2" :
- (cpuinfo.use_fpu & PVR0_USE_FPU_MASK) ? "v1" : "no");
-
- count += seq_printf(m,
- " Exc:\t\t%s%s%s%s%s%s%s%s\n",
- (cpuinfo.use_exc & PVR2_OPCODE_0x0_ILL_MASK) ? "op0x0 " : "",
- (cpuinfo.use_exc & PVR2_UNALIGNED_EXC_MASK) ? "unal " : "",
- (cpuinfo.use_exc & PVR2_ILL_OPCODE_EXC_MASK) ? "ill " : "",
- (cpuinfo.use_exc & PVR2_IOPB_BUS_EXC_MASK) ? "iopb " : "",
- (cpuinfo.use_exc & PVR2_DOPB_BUS_EXC_MASK) ? "dopb " : "",
- (cpuinfo.use_exc & PVR2_DIV_ZERO_EXC_MASK) ? "zero " : "",
- (cpuinfo.use_exc & PVR2_FPU_EXC_MASK) ? "fpu " : "",
- (cpuinfo.use_exc & PVR2_USE_FSL_EXC) ? "fsl " : "");
-
- count += seq_printf(m,
- "Stream-insns:\t%sprivileged\n",
- cpuinfo.mmu_privins ? "un" : "");
+ seq_printf(m,
+ "CPU-Family: MicroBlaze\n"
+ "FPGA-Arch: %s\n"
+ "CPU-Ver: %s, %s endian\n"
+ "CPU-MHz: %d.%02d\n"
+ "BogoMips: %lu.%02lu\n",
+ fpga_family,
+ cpu_ver,
+ cpuinfo.endian ? "little" : "big",
+ cpuinfo.cpu_clock_freq / 1000000,
+ cpuinfo.cpu_clock_freq % 1000000,
+ loops_per_jiffy / (500000 / HZ),
+ (loops_per_jiffy / (5000 / HZ)) % 100);
+
+ seq_printf(m,
+ "HW:\n Shift:\t\t%s\n"
+ " MSR:\t\t%s\n"
+ " PCMP:\t\t%s\n"
+ " DIV:\t\t%s\n",
+ (cpuinfo.use_instr & PVR0_USE_BARREL_MASK) ? "yes" : "no",
+ (cpuinfo.use_instr & PVR2_USE_MSR_INSTR) ? "yes" : "no",
+ (cpuinfo.use_instr & PVR2_USE_PCMP_INSTR) ? "yes" : "no",
+ (cpuinfo.use_instr & PVR0_USE_DIV_MASK) ? "yes" : "no");
+
+ seq_printf(m, " MMU:\t\t%x\n", cpuinfo.mmu);
+
+ seq_printf(m,
+ " MUL:\t\t%s\n"
+ " FPU:\t\t%s\n",
+ (cpuinfo.use_mult & PVR2_USE_MUL64_MASK) ? "v2" :
+ (cpuinfo.use_mult & PVR0_USE_HW_MUL_MASK) ? "v1" : "no",
+ (cpuinfo.use_fpu & PVR2_USE_FPU2_MASK) ? "v2" :
+ (cpuinfo.use_fpu & PVR0_USE_FPU_MASK) ? "v1" : "no");
+
+ seq_printf(m,
+ " Exc:\t\t%s%s%s%s%s%s%s%s\n",
+ (cpuinfo.use_exc & PVR2_OPCODE_0x0_ILL_MASK) ? "op0x0 " : "",
+ (cpuinfo.use_exc & PVR2_UNALIGNED_EXC_MASK) ? "unal " : "",
+ (cpuinfo.use_exc & PVR2_ILL_OPCODE_EXC_MASK) ? "ill " : "",
+ (cpuinfo.use_exc & PVR2_IOPB_BUS_EXC_MASK) ? "iopb " : "",
+ (cpuinfo.use_exc & PVR2_DOPB_BUS_EXC_MASK) ? "dopb " : "",
+ (cpuinfo.use_exc & PVR2_DIV_ZERO_EXC_MASK) ? "zero " : "",
+ (cpuinfo.use_exc & PVR2_FPU_EXC_MASK) ? "fpu " : "",
+ (cpuinfo.use_exc & PVR2_USE_FSL_EXC) ? "fsl " : "");
+
+ seq_printf(m,
+ "Stream-insns:\t%sprivileged\n",
+ cpuinfo.mmu_privins ? "un" : "");
if (cpuinfo.use_icache)
- count += seq_printf(m,
- "Icache:\t\t%ukB\tline length:\t%dB\n",
- cpuinfo.icache_size >> 10,
- cpuinfo.icache_line_length);
+ seq_printf(m,
+ "Icache:\t\t%ukB\tline length:\t%dB\n",
+ cpuinfo.icache_size >> 10,
+ cpuinfo.icache_line_length);
else
- count += seq_printf(m, "Icache:\t\tno\n");
+ seq_puts(m, "Icache:\t\tno\n");
if (cpuinfo.use_dcache) {
- count += seq_printf(m,
- "Dcache:\t\t%ukB\tline length:\t%dB\n",
- cpuinfo.dcache_size >> 10,
- cpuinfo.dcache_line_length);
- seq_printf(m, "Dcache-Policy:\t");
+ seq_printf(m,
+ "Dcache:\t\t%ukB\tline length:\t%dB\n",
+ cpuinfo.dcache_size >> 10,
+ cpuinfo.dcache_line_length);
+ seq_puts(m, "Dcache-Policy:\t");
if (cpuinfo.dcache_wb)
- count += seq_printf(m, "write-back\n");
+ seq_puts(m, "write-back\n");
else
- count += seq_printf(m, "write-through\n");
- } else
- count += seq_printf(m, "Dcache:\t\tno\n");
+ seq_puts(m, "write-through\n");
+ } else {
+ seq_puts(m, "Dcache:\t\tno\n");
+ }
+
+ seq_printf(m,
+ "HW-Debug:\t%s\n",
+ cpuinfo.hw_debug ? "yes" : "no");
- count += seq_printf(m,
- "HW-Debug:\t%s\n",
- cpuinfo.hw_debug ? "yes" : "no");
+ seq_printf(m,
+ "PVR-USR1:\t%02x\n"
+ "PVR-USR2:\t%08x\n",
+ cpuinfo.pvr_user1,
+ cpuinfo.pvr_user2);
- count += seq_printf(m,
- "PVR-USR1:\t%02x\n"
- "PVR-USR2:\t%08x\n",
- cpuinfo.pvr_user1,
- cpuinfo.pvr_user2);
+ seq_printf(m, "Page size:\t%lu\n", PAGE_SIZE);
- count += seq_printf(m, "Page size:\t%lu\n", PAGE_SIZE);
return 0;
}
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 3b44f5e2c17c..7c2db950eb00 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -23,7 +23,7 @@ config MIPS
select HAVE_KRETPROBES
select HAVE_DEBUG_KMEMLEAK
select HAVE_SYSCALL_TRACEPOINTS
- select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ select ARCH_HAS_ELF_RANDOMIZE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
select RTC_LIB if !MACH_LOONGSON
select GENERIC_ATOMIC64 if !64BIT
@@ -2680,6 +2680,11 @@ config STACKTRACE_SUPPORT
bool
default y
+config PGTABLE_LEVELS
+ int
+ default 3 if 64BIT && !PAGE_SIZE_64KB
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h
index 694925a26924..d8dd78194e09 100644
--- a/arch/mips/include/asm/elf.h
+++ b/arch/mips/include/asm/elf.h
@@ -415,10 +415,6 @@ struct linux_binprm;
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
struct arch_elf_state {
int fp_abi;
int interp_fp_abi;
diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index f29c75cf83c6..1d8a2e2c75c1 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -2,11 +2,6 @@
#include <linux/unistd.h>
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
/*
* Kludge alert:
*
@@ -29,4 +24,6 @@
#endif /* CONFIG_MIPS32_O32 */
+#include <asm-generic/seccomp.h>
+
#endif /* __ASM_SECCOMP_H */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index cfcb876cae6b..106e741aa7ee 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -67,6 +67,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index f1baadd56e82..5c81fdd032c3 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -142,18 +142,26 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
addr0, len, pgoff, flags, DOWN);
}
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+ rnd = (unsigned long)get_random_int();
+ rnd <<= PAGE_SHIFT;
+ if (TASK_IS_32BIT_ADDR)
+ rnd &= 0xfffffful;
+ else
+ rnd &= 0xffffffful;
+
+ return rnd;
+}
+
void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = 0UL;
- if (current->flags & PF_RANDOMIZE) {
- random_factor = get_random_int();
- random_factor = random_factor << PAGE_SHIFT;
- if (TASK_IS_32BIT_ADDR)
- random_factor &= 0xfffffful;
- else
- random_factor &= 0xffffffful;
- }
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c
index a223691dff4f..1d96de0bd4aa 100644
--- a/arch/nios2/kernel/cpuinfo.c
+++ b/arch/nios2/kernel/cpuinfo.c
@@ -126,47 +126,46 @@ void __init setup_cpuinfo(void)
*/
static int show_cpuinfo(struct seq_file *m, void *v)
{
- int count = 0;
const u32 clockfreq = cpuinfo.cpu_clock_freq;
- count = seq_printf(m,
- "CPU:\t\tNios II/%s\n"
- "MMU:\t\t%s\n"
- "FPU:\t\tnone\n"
- "Clocking:\t%u.%02u MHz\n"
- "BogoMips:\t%lu.%02lu\n"
- "Calibration:\t%lu loops\n",
- cpuinfo.cpu_impl,
- cpuinfo.mmu ? "present" : "none",
- clockfreq / 1000000, (clockfreq / 100000) % 10,
- (loops_per_jiffy * HZ) / 500000,
- ((loops_per_jiffy * HZ) / 5000) % 100,
- (loops_per_jiffy * HZ));
-
- count += seq_printf(m,
- "HW:\n"
- " MUL:\t\t%s\n"
- " MULX:\t\t%s\n"
- " DIV:\t\t%s\n",
- cpuinfo.has_mul ? "yes" : "no",
- cpuinfo.has_mulx ? "yes" : "no",
- cpuinfo.has_div ? "yes" : "no");
-
- count += seq_printf(m,
- "Icache:\t\t%ukB, line length: %u\n",
- cpuinfo.icache_size >> 10,
- cpuinfo.icache_line_size);
-
- count += seq_printf(m,
- "Dcache:\t\t%ukB, line length: %u\n",
- cpuinfo.dcache_size >> 10,
- cpuinfo.dcache_line_size);
-
- count += seq_printf(m,
- "TLB:\t\t%u ways, %u entries, %u PID bits\n",
- cpuinfo.tlb_num_ways,
- cpuinfo.tlb_num_entries,
- cpuinfo.tlb_pid_num_bits);
+ seq_printf(m,
+ "CPU:\t\tNios II/%s\n"
+ "MMU:\t\t%s\n"
+ "FPU:\t\tnone\n"
+ "Clocking:\t%u.%02u MHz\n"
+ "BogoMips:\t%lu.%02lu\n"
+ "Calibration:\t%lu loops\n",
+ cpuinfo.cpu_impl,
+ cpuinfo.mmu ? "present" : "none",
+ clockfreq / 1000000, (clockfreq / 100000) % 10,
+ (loops_per_jiffy * HZ) / 500000,
+ ((loops_per_jiffy * HZ) / 5000) % 100,
+ (loops_per_jiffy * HZ));
+
+ seq_printf(m,
+ "HW:\n"
+ " MUL:\t\t%s\n"
+ " MULX:\t\t%s\n"
+ " DIV:\t\t%s\n",
+ cpuinfo.has_mul ? "yes" : "no",
+ cpuinfo.has_mulx ? "yes" : "no",
+ cpuinfo.has_div ? "yes" : "no");
+
+ seq_printf(m,
+ "Icache:\t\t%ukB, line length: %u\n",
+ cpuinfo.icache_size >> 10,
+ cpuinfo.icache_line_size);
+
+ seq_printf(m,
+ "Dcache:\t\t%ukB, line length: %u\n",
+ cpuinfo.dcache_size >> 10,
+ cpuinfo.dcache_line_size);
+
+ seq_printf(m,
+ "TLB:\t\t%u ways, %u entries, %u PID bits\n",
+ cpuinfo.tlb_num_ways,
+ cpuinfo.tlb_num_entries,
+ cpuinfo.tlb_pid_num_bits);
return 0;
}
diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c
index 4fc7ccc0a2cf..b4ed8b36e078 100644
--- a/arch/openrisc/kernel/setup.c
+++ b/arch/openrisc/kernel/setup.c
@@ -329,30 +329,32 @@ static int show_cpuinfo(struct seq_file *m, void *v)
version = (vr & SPR_VR_VER) >> 24;
revision = vr & SPR_VR_REV;
- return seq_printf(m,
- "cpu\t\t: OpenRISC-%x\n"
- "revision\t: %d\n"
- "frequency\t: %ld\n"
- "dcache size\t: %d bytes\n"
- "dcache block size\t: %d bytes\n"
- "icache size\t: %d bytes\n"
- "icache block size\t: %d bytes\n"
- "immu\t\t: %d entries, %lu ways\n"
- "dmmu\t\t: %d entries, %lu ways\n"
- "bogomips\t: %lu.%02lu\n",
- version,
- revision,
- loops_per_jiffy * HZ,
- cpuinfo.dcache_size,
- cpuinfo.dcache_block_size,
- cpuinfo.icache_size,
- cpuinfo.icache_block_size,
- 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
- 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW),
- 1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2),
- 1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW),
- (loops_per_jiffy * HZ) / 500000,
- ((loops_per_jiffy * HZ) / 5000) % 100);
+ seq_printf(m,
+ "cpu\t\t: OpenRISC-%x\n"
+ "revision\t: %d\n"
+ "frequency\t: %ld\n"
+ "dcache size\t: %d bytes\n"
+ "dcache block size\t: %d bytes\n"
+ "icache size\t: %d bytes\n"
+ "icache block size\t: %d bytes\n"
+ "immu\t\t: %d entries, %lu ways\n"
+ "dmmu\t\t: %d entries, %lu ways\n"
+ "bogomips\t: %lu.%02lu\n",
+ version,
+ revision,
+ loops_per_jiffy * HZ,
+ cpuinfo.dcache_size,
+ cpuinfo.dcache_block_size,
+ cpuinfo.icache_size,
+ cpuinfo.icache_block_size,
+ 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
+ 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW),
+ 1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2),
+ 1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW),
+ (loops_per_jiffy * HZ) / 500000,
+ ((loops_per_jiffy * HZ) / 5000) % 100);
+
+ return 0;
}
static void *c_start(struct seq_file *m, loff_t * pos)
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 8014727a2743..c36546959e86 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -103,6 +103,11 @@ config ARCH_MAY_HAVE_PC_FDC
depends on BROKEN
default y
+config PGTABLE_LEVELS
+ int
+ default 3 if 64BIT && PARISC_PAGE_SIZE_4KB
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 8686237a3c3c..12b341d04f88 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -20,6 +20,7 @@ generic-y += param.h
generic-y += percpu.h
generic-y += poll.h
generic-y += preempt.h
+generic-y += seccomp.h
generic-y += segment.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index d17437238a2c..1ba29369257c 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -51,7 +51,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
free_pages((unsigned long)pgd, PGD_ALLOC_ORDER);
}
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
/* Three Level Page Table Support for pmd's */
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 15207b9362bf..0a183756d6ec 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -68,13 +68,11 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long);
#define KERNEL_INITIAL_ORDER 24 /* 0 to 1<<24 = 16MB */
#define KERNEL_INITIAL_SIZE (1 << KERNEL_INITIAL_ORDER)
-#if defined(CONFIG_64BIT) && defined(CONFIG_PARISC_PAGE_SIZE_4KB)
-#define PT_NLEVELS 3
+#if CONFIG_PGTABLE_LEVELS == 3
#define PGD_ORDER 1 /* Number of pages per pgd */
#define PMD_ORDER 1 /* Number of pages per pmd */
#define PGD_ALLOC_ORDER 2 /* first pgd contains pmd */
#else
-#define PT_NLEVELS 2
#define PGD_ORDER 1 /* Number of pages per pgd */
#define PGD_ALLOC_ORDER PGD_ORDER
#endif
@@ -93,7 +91,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long);
#define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE)
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
#define BITS_PER_PMD (PAGE_SHIFT + PMD_ORDER - BITS_PER_PMD_ENTRY)
#else
#define __PAGETABLE_PMD_FOLDED
@@ -277,7 +275,7 @@ extern unsigned long *empty_zero_page;
#define pgd_flag(x) (pgd_val(x) & PxD_FLAG_MASK)
#define pgd_address(x) ((unsigned long)(pgd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT)
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
/* The first entry of the permanent pmd is not there if it contains
* the gateway marker */
#define pmd_none(x) (!pmd_val(x) || pmd_flag(x) == PxD_FLAG_ATTACHED)
@@ -287,7 +285,7 @@ extern unsigned long *empty_zero_page;
#define pmd_bad(x) (!(pmd_flag(x) & PxD_FLAG_VALID))
#define pmd_present(x) (pmd_flag(x) & PxD_FLAG_PRESENT)
static inline void pmd_clear(pmd_t *pmd) {
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED)
/* This is the entry pointing to the permanent pmd
* attached to the pgd; cannot clear it */
@@ -299,7 +297,7 @@ static inline void pmd_clear(pmd_t *pmd) {
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_address(pgd)))
#define pgd_page(pgd) virt_to_page((void *)pgd_page_vaddr(pgd))
@@ -309,7 +307,7 @@ static inline void pmd_clear(pmd_t *pmd) {
#define pgd_bad(x) (!(pgd_flag(x) & PxD_FLAG_VALID))
#define pgd_present(x) (pgd_flag(x) & PxD_FLAG_PRESENT)
static inline void pgd_clear(pgd_t *pgd) {
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
if(pgd_flag(*pgd) & PxD_FLAG_ATTACHED)
/* This is the permanent pmd attached to the pgd; cannot
* free it */
@@ -393,7 +391,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
/* Find an entry in the second-level page table.. */
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
#define pmd_offset(dir,address) \
((pmd_t *) pgd_page_vaddr(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1)))
#else
diff --git a/arch/parisc/include/asm/seccomp.h b/arch/parisc/include/asm/seccomp.h
deleted file mode 100644
index 015f7887aa29..000000000000
--- a/arch/parisc/include/asm/seccomp.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _ASM_PARISC_SECCOMP_H
-#define _ASM_PARISC_SECCOMP_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_read
-#define __NR_seccomp_write_32 __NR_write
-#define __NR_seccomp_exit_32 __NR_exit
-#define __NR_seccomp_sigreturn_32 __NR_rt_sigreturn
-
-#endif /* _ASM_PARISC_SECCOMP_H */
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 294d251ca7b2..6cb8db76fd4e 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -40,6 +40,7 @@
#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
#define MADV_VPS_INHERIT 7 /* Inherit parents page size */
+#define MADV_FREE 8 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 2ab16bb160a8..75819617f93b 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -398,7 +398,7 @@
* can address up to 1TB
*/
.macro L2_ptep pmd,pte,index,va,fault
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
extru \va,31-ASM_PMD_SHIFT,ASM_BITS_PER_PMD,\index
#else
# if defined(CONFIG_64BIT)
@@ -436,7 +436,7 @@
* all ILP32 processes and all the kernel for machines with
* under 4GB of memory) */
.macro L3_ptep pgd,pte,index,va,fault
-#if PT_NLEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */
+#if CONFIG_PGTABLE_LEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */
extrd,u \va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index
copy %r0,\pte
extrd,u,*= \va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0
diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S
index d4dc588c0dc1..e7d64527aff9 100644
--- a/arch/parisc/kernel/head.S
+++ b/arch/parisc/kernel/head.S
@@ -74,7 +74,7 @@ $bss_loop:
mtctl %r4,%cr24 /* Initialize kernel root pointer */
mtctl %r4,%cr25 /* Initialize user root pointer */
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
/* Set pmd in pgd */
load32 PA(pmd0),%r5
shrd %r5,PxD_VALUE_SHIFT,%r3
@@ -97,7 +97,7 @@ $bss_loop:
stw %r3,0(%r4)
ldo (PAGE_SIZE >> PxD_VALUE_SHIFT)(%r3),%r3
addib,> -1,%r1,1b
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
ldo ASM_PMD_ENTRY_SIZE(%r4),%r4
#else
ldo ASM_PGD_ENTRY_SIZE(%r4),%r4
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 15dbe81cf5f3..c229427fa546 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -34,7 +34,7 @@
extern int data_start;
extern void parisc_kernel_start(void); /* Kernel entry point in head.S */
-#if PT_NLEVELS == 3
+#if CONFIG_PGTABLE_LEVELS == 3
/* NOTE: This layout exactly conforms to the hybrid L2/L3 page table layout
* with the first pmd adjacent to the pgd and below it. gcc doesn't actually
* guarantee that global objects will be laid out in memory in the same order
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1734f8a338eb..fb48a2e2f0b3 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -88,7 +88,7 @@ config PPC
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select BINFMT_ELF
- select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ select ARCH_HAS_ELF_RANDOMIZE
select OF
select OF_EARLY_FLATTREE
select OF_RESERVED_MEM
@@ -294,6 +294,12 @@ config ZONE_DMA32
bool
default y if PPC64
+config PGTABLE_LEVELS
+ int
+ default 2 if !PPC64
+ default 3 if PPC_64K_PAGES
+ default 4
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index 57d289acb803..ee46ffef608e 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -128,10 +128,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
(0x7ff >> (PAGE_SHIFT - 12)) : \
(0x3ffff >> (PAGE_SHIFT - 12)))
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
-
#ifdef CONFIG_SPU_BASE
/* Notes used in ET_CORE. Note name is "SPU/<fd>/<filename>". */
#define NT_SPU 1
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 43e6ad424c7f..1bc103a86160 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -491,9 +491,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
new file mode 100644
index 000000000000..c1818e35cf02
--- /dev/null
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_POWERPC_SECCOMP_H
+#define _ASM_POWERPC_SECCOMP_H
+
+#include <linux/unistd.h>
+
+#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_POWERPC_SECCOMP_H */
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 7a3f795ac218..79c4068be278 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -25,7 +25,6 @@ header-y += posix_types.h
header-y += ps3fb.h
header-y += ptrace.h
header-y += resource.h
-header-y += seccomp.h
header-y += sembuf.h
header-y += setup.h
header-y += shmbuf.h
diff --git a/arch/powerpc/include/uapi/asm/seccomp.h b/arch/powerpc/include/uapi/asm/seccomp.h
deleted file mode 100644
index 00c1d9133cfe..000000000000
--- a/arch/powerpc/include/uapi/asm/seccomp.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _ASM_POWERPC_SECCOMP_H
-#define _ASM_POWERPC_SECCOMP_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_read
-#define __NR_seccomp_write_32 __NR_write
-#define __NR_seccomp_exit_32 __NR_exit
-#define __NR_seccomp_sigreturn_32 __NR_sigreturn
-
-#endif /* _ASM_POWERPC_SECCOMP_H */
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index cb8bdbe4972f..0f0502e12f6c 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -53,21 +53,20 @@ static inline int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
{
- unsigned long rnd = 0;
+ unsigned long rnd;
+
+ /* 8MB for 32bit, 1GB for 64bit */
+ if (is_32bit_task())
+ rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT));
+ else
+ rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT));
- if (current->flags & PF_RANDOMIZE) {
- /* 8MB for 32bit, 1GB for 64bit */
- if (is_32bit_task())
- rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
- else
- rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
- }
return rnd << PAGE_SHIFT;
}
-static inline unsigned long mmap_base(void)
+static inline unsigned long mmap_base(unsigned long rnd)
{
unsigned long gap = rlimit(RLIMIT_STACK);
@@ -76,7 +75,7 @@ static inline unsigned long mmap_base(void)
else if (gap > MAX_GAP)
gap = MAX_GAP;
- return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+ return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}
/*
@@ -85,6 +84,11 @@ static inline unsigned long mmap_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
@@ -93,7 +97,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base();
+ mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4bdca5e58642..de2726a487b0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -65,6 +65,7 @@ config S390
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -155,6 +156,11 @@ config S390
config SCHED_OMIT_FRAME_POINTER
def_bool y
+config PGTABLE_LEVELS
+ int
+ default 4 if 64BIT
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -322,6 +328,7 @@ config COMPAT
select COMPAT_BINFMT_ELF if BINFMT_ELF
select ARCH_WANT_OLD_COMPAT_IPC
select COMPAT_OLD_SIGACTION
+ depends on MULTIUSER
help
Select this option if you want to enable your system kernel to
handle system-calls from ELF binaries for 31 bit ESA. This option
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 5214d15680ab..3ad48f22de78 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -157,10 +157,11 @@ extern unsigned int vdso_enabled;
/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
use of this is to invoke "./ld.so someprog" to test out a new version of
the loader. We need to make sure that it is out of the way of the program
- that it will "exec", and that there is sufficient room for the brk. */
-
-extern unsigned long randomize_et_dyn(void);
-#define ELF_ET_DYN_BASE randomize_et_dyn()
+ that it will "exec", and that there is sufficient room for the brk. 64-bit
+ tasks are aligned to 4GB. */
+#define ELF_ET_DYN_BASE (is_32bit_task() ? \
+ (STACK_TOP / 3 * 2) : \
+ (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))
/* This yields a mask that user programs can use to figure out what
instruction set this CPU supports. */
@@ -221,9 +222,6 @@ struct linux_binprm;
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
int arch_setup_additional_pages(struct linux_binprm *, int);
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs);
#endif
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index b68af0564a42..6e552af08c76 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -60,22 +60,20 @@ static inline int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
{
- if (!(current->flags & PF_RANDOMIZE))
- return 0;
if (is_32bit_task())
return (get_random_int() & 0x7ff) << PAGE_SHIFT;
else
return (get_random_int() & mmap_rnd_mask) << PAGE_SHIFT;
}
-static unsigned long mmap_base_legacy(void)
+static unsigned long mmap_base_legacy(unsigned long rnd)
{
- return TASK_UNMAPPED_BASE + mmap_rnd();
+ return TASK_UNMAPPED_BASE + rnd;
}
-static inline unsigned long mmap_base(void)
+static inline unsigned long mmap_base(unsigned long rnd)
{
unsigned long gap = rlimit(RLIMIT_STACK);
@@ -84,7 +82,7 @@ static inline unsigned long mmap_base(void)
else if (gap > MAX_GAP)
gap = MAX_GAP;
gap &= PAGE_MASK;
- return STACK_TOP - stack_maxrandom_size() - mmap_rnd() - gap;
+ return STACK_TOP - stack_maxrandom_size() - rnd - gap;
}
unsigned long
@@ -179,17 +177,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
-unsigned long randomize_et_dyn(void)
-{
- unsigned long base;
-
- base = STACK_TOP / 3 * 2;
- if (!is_32bit_task())
- /* Align to 4GB */
- base &= ~((1UL << 32) - 1);
- return base + mmap_rnd();
-}
-
int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
{
if (is_compat_task() || (TASK_SIZE >= (1UL << 53)))
@@ -250,15 +237,20 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
if (mmap_is_legacy()) {
- mm->mmap_base = mmap_base_legacy();
+ mm->mmap_base = mmap_base_legacy(random_factor);
mm->get_unmapped_area = s390_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base();
+ mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = s390_get_unmapped_area_topdown;
}
}
diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c
index 3229a2e570df..c22d4402ae45 100644
--- a/arch/s390/pci/pci_debug.c
+++ b/arch/s390/pci/pci_debug.c
@@ -45,8 +45,10 @@ static int pci_perf_show(struct seq_file *m, void *v)
if (!zdev)
return 0;
- if (!zdev->fmb)
- return seq_printf(m, "FMB statistics disabled\n");
+ if (!zdev->fmb) {
+ seq_puts(m, "FMB statistics disabled\n");
+ return 0;
+ }
/* header */
seq_printf(m, "FMB @ %p\n", zdev->fmb);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index eb4ef274ae9b..50057fed819d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -162,6 +162,10 @@ config NEED_DMA_MAP_STATE
config NEED_SG_DMA_LENGTH
def_bool y
+config PGTABLE_LEVELS
+ default 3 if X2TLB
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index 67a049e75ec1..9d209a07235e 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -993,7 +993,7 @@ static struct unwinder dwarf_unwinder = {
.rating = 150,
};
-static void dwarf_unwinder_cleanup(void)
+static void __init dwarf_unwinder_cleanup(void)
{
struct dwarf_fde *fde, *next_fde;
struct dwarf_cie *cie, *next_cie;
@@ -1009,6 +1009,10 @@ static void dwarf_unwinder_cleanup(void)
rbtree_postorder_for_each_entry_safe(cie, next_cie, &cie_root, node)
kfree(cie);
+ if (dwarf_reg_pool)
+ mempool_destroy(dwarf_reg_pool);
+ if (dwarf_frame_pool)
+ mempool_destroy(dwarf_frame_pool);
kmem_cache_destroy(dwarf_reg_cachep);
kmem_cache_destroy(dwarf_frame_cachep);
}
@@ -1176,17 +1180,13 @@ static int __init dwarf_unwinder_init(void)
sizeof(struct dwarf_reg), 0,
SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
- dwarf_frame_pool = mempool_create(DWARF_FRAME_MIN_REQ,
- mempool_alloc_slab,
- mempool_free_slab,
- dwarf_frame_cachep);
+ dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ,
+ dwarf_frame_cachep);
if (!dwarf_frame_pool)
goto out;
- dwarf_reg_pool = mempool_create(DWARF_REG_MIN_REQ,
- mempool_alloc_slab,
- mempool_free_slab,
- dwarf_reg_cachep);
+ dwarf_reg_pool = mempool_create_slab_pool(DWARF_REG_MIN_REQ,
+ dwarf_reg_cachep);
if (!dwarf_reg_pool)
goto out;
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index efb00ec75805..e49502acbab4 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -146,6 +146,10 @@ config GENERIC_ISA_DMA
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y if SPARC64
+config PGTABLE_LEVELS
+ default 4 if 64BIT
+ default 3
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index dc165ebdf05a..5c02174828ed 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -697,6 +697,15 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return __pmd(pte_val(pte));
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ pte_t pte = __pte(pmd_val(pmd));
+
+ pte = pte_mkclean(pte);
+
+ return __pmd(pte_val(pte));
+}
+
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index adca1bce41d4..5ef8826d44f8 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -1,15 +1,10 @@
#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
#include <linux/unistd.h>
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_read
-#define __NR_seccomp_write_32 __NR_write
-#define __NR_seccomp_exit_32 __NR_exit
#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#include <asm-generic/seccomp.h>
+
#endif /* _ASM_SECCOMP_H */
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 99632a87e697..26c80e18d7b1 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -130,26 +130,26 @@ static struct mdesc_mem_ops memblock_mdesc_ops = {
static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size)
{
unsigned int handle_size;
+ struct mdesc_handle *hp;
+ unsigned long addr;
void *base;
handle_size = (sizeof(struct mdesc_handle) -
sizeof(struct mdesc_hdr) +
mdesc_size);
+ /*
+ * Allocation has to succeed because mdesc update would be missed
+ * and such events are not retransmitted.
+ */
base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL);
- if (base) {
- struct mdesc_handle *hp;
- unsigned long addr;
-
- addr = (unsigned long)base;
- addr = (addr + 15UL) & ~15UL;
- hp = (struct mdesc_handle *) addr;
+ addr = (unsigned long)base;
+ addr = (addr + 15UL) & ~15UL;
+ hp = (struct mdesc_handle *) addr;
- mdesc_handle_init(hp, handle_size, base);
- return hp;
- }
+ mdesc_handle_init(hp, handle_size, base);
- return NULL;
+ return hp;
}
static void mdesc_kfree(struct mdesc_handle *hp)
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 7cca41842a9e..0142d578b5a8 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -147,6 +147,11 @@ config ARCH_DEFCONFIG
default "arch/tile/configs/tilepro_defconfig" if !TILEGX
default "arch/tile/configs/tilegx_defconfig" if TILEGX
+config PGTABLE_LEVELS
+ int
+ default 3 if 64BIT
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um
index a7520c90f62d..5dbfe3d9107c 100644
--- a/arch/um/Kconfig.um
+++ b/arch/um/Kconfig.um
@@ -155,3 +155,8 @@ config MMAPPER
config NO_DMA
def_bool y
+
+config PGTABLE_LEVELS
+ int
+ default 3 if 3_LEVEL_PGTABLES
+ default 2
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 367cec647d2a..b7b6e0a9644a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,7 +88,7 @@ config X86
select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_USER_RETURN_NOTIFIER
- select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ select ARCH_HAS_ELF_RANDOMIZE
select HAVE_ARCH_JUMP_LABEL
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select SPARSE_IRQ
@@ -100,6 +100,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -278,6 +279,12 @@ config ARCH_SUPPORTS_UPROBES
config FIX_EARLYCON_MEM
def_bool y
+config PGTABLE_LEVELS
+ int
+ default 4 if X86_64
+ default 3 if X86_PAE
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -715,17 +722,6 @@ endif #HYPERVISOR_GUEST
config NO_BOOTMEM
def_bool y
-config MEMTEST
- bool "Memtest"
- ---help---
- This option adds a kernel parameter 'memtest', which allows memtest
- to be set.
- memtest=0, mean disabled; -- default
- memtest=1, mean do 1 test pattern;
- ...
- memtest=4, mean do 4 test patterns.
- If you are unsure how to answer this question, answer N.
-
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
}
#endif
-#ifdef CONFIG_MEMTEST
-extern void early_memtest(unsigned long start, unsigned long end);
-#else
-static inline void early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index ca3347a9dab5..bbdace22daf8 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -338,9 +338,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
/*
* True on X86_32 or when emulating IA32 on X86_64
*/
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
+#define IOREMAP_MAX_ORDER (PUD_SHIFT)
#else
#include <asm/page_32_types.h>
+#define IOREMAP_MAX_ORDER (PMD_SHIFT)
#endif /* CONFIG_X86_64 */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 965c47d254aa..e599242a6cf8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
}
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
static inline pmd_t __pmd(pmdval_t val)
{
pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
val);
}
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
set_pud(pudp, __pud(0));
}
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
#ifdef CONFIG_X86_PAE
/* Special-case pte-setting operations for PAE, which can't update a
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b369e4..f7b0b5c112f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,7 +294,7 @@ struct pv_mmu_ops {
struct paravirt_callee_save pgd_val;
struct paravirt_callee_save make_pgd;
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -308,13 +308,13 @@ struct pv_mmu_ops {
struct paravirt_callee_save pmd_val;
struct paravirt_callee_save make_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
struct paravirt_callee_save pud_val;
struct paravirt_callee_save make_pud;
void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif /* PAGETABLE_LEVELS == 4 */
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
struct pv_lazy_ops lazy_mode;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
#define pmd_pgtable(pmd) pmd_page(pmd)
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
}
#endif /* CONFIG_X86_PAE */
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
{
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
___pud_free_tlb(tlb, pud);
}
-#endif /* PAGETABLE_LEVELS > 3 */
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 2
/*
* traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
#define SHARED_KERNEL_PMD 1
#endif
-#define PAGETABLE_LEVELS 3
-
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..affcb3459847 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -267,6 +267,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
@@ -551,7 +556,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
return npg >> (20 - PAGE_SHIFT);
}
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline int pud_none(pud_t pud)
{
return native_pud_val(pud) == 0;
@@ -594,9 +599,9 @@ static inline int pud_large(pud_t pud)
{
return 0;
}
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline int pgd_present(pgd_t pgd)
{
return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +638,7 @@ static inline int pgd_none(pgd_t pgd)
{
return !native_pgd_val(pgd);
}
-#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 4
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
}
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
}
#endif
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 0f3d7f099224..0c8c7c8861b4 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -1,5 +1,20 @@
+#ifndef _ASM_X86_SECCOMP_H
+#define _ASM_X86_SECCOMP_H
+
+#include <asm/unistd.h>
+
#ifdef CONFIG_X86_32
-# include <asm/seccomp_32.h>
-#else
-# include <asm/seccomp_64.h>
+#define __NR_seccomp_sigreturn __NR_sigreturn
#endif
+
+#ifdef CONFIG_COMPAT
+#include <asm/ia32_unistd.h>
+#define __NR_seccomp_read_32 __NR_ia32_read
+#define __NR_seccomp_write_32 __NR_ia32_write
+#define __NR_seccomp_exit_32 __NR_ia32_exit
+#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#endif
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
deleted file mode 100644
index b811d6f5780c..000000000000
--- a/arch/x86/include/asm/seccomp_32.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_32_H
-#define _ASM_X86_SECCOMP_32_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
deleted file mode 100644
index 84ec1bd161a5..000000000000
--- a/arch/x86/include/asm/seccomp_64.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_64_H
-#define _ASM_X86_SECCOMP_64_H
-
-#include <linux/unistd.h>
-#include <asm/ia32_unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_ia32_read
-#define __NR_seccomp_write_32 __NR_ia32_write
-#define __NR_seccomp_exit_32 __NR_ia32_exit
-#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
- int i, max, len;
+ int i, max;
mtrr_type type;
unsigned long base, size;
- len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
- len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
- "(%5luMB), size=%5lu%cB, count=%d: %s\n",
- i, base, base >> (20 - PAGE_SHIFT), size,
- factor, mtrr_usage_table[i],
- mtrr_attrib_to_str(type));
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
return 0;
}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
* can get false positives too easily, for example if the host is
* overcommitted.
*/
- watchdog_enable_hardlockup_detector(false);
+ hardlockup_detector_disable();
}
static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 415480d3ea84..ab48c252e661 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -335,6 +335,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
(unsigned long)&_text - __START_KERNEL);
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..c614dd492f5f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
.pte_clear = native_pte_clear,
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = {
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
.pud_val = PTE_IDENT,
.make_pud = PTE_IDENT,
.set_pgd = native_set_pgd,
#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
.pte_val = PTE_IDENT,
.pgd_val = PTE_IDENT,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 768e120cba44..d50a367cd9d4 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -635,7 +635,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
static void
handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
- bool failed;
+ bool stepping, failed;
+
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -659,12 +660,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
}
/*
- * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
- * flag so that register information in the sigcontext is correct.
+ * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+ * so that register information in the sigcontext is correct and
+ * then notify the tracer before entering the signal handler.
*/
- if (unlikely(regs->flags & X86_EFLAGS_TF) &&
- likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
- regs->flags &= ~X86_EFLAGS_TF;
+ stepping = test_thread_flag(TIF_SINGLESTEP);
+ if (stepping)
+ user_disable_single_step(current);
failed = (setup_rt_frame(ksig, regs) < 0);
if (!failed) {
@@ -675,10 +677,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* it might disable possible debug exception from the
* signal handler.
*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
+ * Clear TF for the case when it wasn't set by debugger to
+ * avoid the recursive send_sigtrap() in SIGTRAP handler.
*/
regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
/*
@@ -687,7 +687,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
if (used_math())
fpu_reset_state(current);
}
- signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
+ signal_setup_done(failed, ksig, stepping);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_MEMTEST) += memtest.o
-
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..5ead4d6cf3a7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
/*
* Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
*
* NOTE! We need to allow non-page-aligned mappings too: we will obviously
* have to convert them into an offset in a page-aligned mapping, but the
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+int arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+ return cpu_has_gbpages;
+#else
+ return 0;
+#endif
+}
+
+int arch_ioremap_pmd_supported(void)
+{
+ return cpu_has_pse;
+}
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,24 +65,23 @@ static int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
{
- unsigned long rnd = 0;
+ unsigned long rnd;
/*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
- if (current->flags & PF_RANDOMIZE) {
- if (mmap_is_ia32())
- rnd = get_random_int() % (1<<8);
- else
- rnd = get_random_int() % (1<<28);
- }
+ * 8 bits of randomness in 32bit mmaps, 20 address space bits
+ * 28 bits of randomness in 64bit mmaps, 40 address space bits
+ */
+ if (mmap_is_ia32())
+ rnd = (unsigned long)get_random_int() % (1<<8);
+ else
+ rnd = (unsigned long)get_random_int() % (1<<28);
+
return rnd << PAGE_SHIFT;
}
-static unsigned long mmap_base(void)
+static unsigned long mmap_base(unsigned long rnd)
{
unsigned long gap = rlimit(RLIMIT_STACK);
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
else if (gap > MAX_GAP)
gap = MAX_GAP;
- return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+ return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}
/*
* Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
* does, but not when emulating X86_32
*/
-static unsigned long mmap_legacy_base(void)
+static unsigned long mmap_legacy_base(unsigned long rnd)
{
if (mmap_is_ia32())
return TASK_UNMAPPED_BASE;
else
- return TASK_UNMAPPED_BASE + mmap_rnd();
+ return TASK_UNMAPPED_BASE + rnd;
}
/*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
- mm->mmap_legacy_base = mmap_legacy_base();
- mm->mmap_base = mmap_base();
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ mm->mmap_legacy_base = mmap_legacy_base(random_factor);
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
+ mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5a7e5252c878..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#include <asm/mtrr.h>
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
@@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
tlb_remove_page(tlb, pte);
}
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = virt_to_page(pmd);
@@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb_remove_page(tlb, page);
}
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
-#endif /* PAGETABLE_LEVELS > 3 */
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
static inline void pgd_list_add(pgd_t *pgd)
{
@@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
- if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
- PAGETABLE_LEVELS == 4) {
+ if (CONFIG_PGTABLE_LEVELS == 2 ||
+ (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ CONFIG_PGTABLE_LEVELS == 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr;
+
+ /*
+ * Do not use a huge page when the range is covered by non-WB type
+ * of MTRRs.
+ */
+ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
+ if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pud, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr;
+
+ /*
+ * Do not use a huge page when the range is covered by non-WB type
+ * of MTRRs.
+ */
+ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
+ if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pmd, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+ if (pmd_large(*pmd)) {
+ pmd_clear(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 29b3be230ede..dd151b2045b0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
/*
* (Yet another) pagetable walker. This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
pv_mmu_ops.set_pgd = xen_set_pgd;
#endif
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_pgd = xen_set_pgd_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 201aec0e0446..1b19f25bc567 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -80,6 +80,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/block/genhd.c b/block/genhd.c
index 0a536dc05f3b..64600e911aaa 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -850,7 +850,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index af9c911cd6b5..2804aed3f416 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -219,6 +219,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn)
/*
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
* OK to have direct references to sparsemem variables in here.
+ * Must already be protected by mem_hotplug_begin().
*/
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
@@ -228,7 +229,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
struct page *first_page;
int ret;
- start_pfn = phys_index << PFN_SECTION_SHIFT;
+ start_pfn = section_nr_to_pfn(phys_index);
first_page = pfn_to_page(start_pfn);
switch (action) {
@@ -286,6 +287,7 @@ static int memory_subsys_online(struct device *dev)
if (mem->online_type < 0)
mem->online_type = MMOP_ONLINE_KEEP;
+ /* Already under protection of mem_hotplug_begin() */
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
/* clear online_type */
@@ -328,17 +330,19 @@ store_mem_state(struct device *dev,
goto err;
}
+ /*
+ * Memory hotplug needs to hold mem_hotplug_begin() for probe to find
+ * the correct memory block to online before doing device_online(dev),
+ * which will take dev->mutex. Take the lock early to prevent an
+ * inversion, memory_subsys_online() callbacks will be implemented by
+ * assuming it's already protected.
+ */
+ mem_hotplug_begin();
+
switch (online_type) {
case MMOP_ONLINE_KERNEL:
case MMOP_ONLINE_MOVABLE:
case MMOP_ONLINE_KEEP:
- /*
- * mem->online_type is not protected so there can be a
- * race here. However, when racing online, the first
- * will succeed and the second will just return as the
- * block will already be online. The online type
- * could be either one, but that is expected.
- */
mem->online_type = online_type;
ret = device_online(&mem->dev);
break;
@@ -349,6 +353,7 @@ store_mem_state(struct device *dev,
ret = -EINVAL; /* should never happen */
}
+ mem_hotplug_done();
err:
unlock_device_hotplug();
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index aab7158d2afe..77262009f89d 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -843,7 +843,6 @@ static int print_wakeup_source_stats(struct seq_file *m,
unsigned long active_count;
ktime_t active_time;
ktime_t prevent_sleep_time;
- int ret;
spin_lock_irqsave(&ws->lock, flags);
@@ -866,17 +865,16 @@ static int print_wakeup_source_stats(struct seq_file *m,
active_time = ktime_set(0, 0);
}
- ret = seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t"
- "%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
- ws->name, active_count, ws->event_count,
- ws->wakeup_count, ws->expire_count,
- ktime_to_ms(active_time), ktime_to_ms(total_time),
- ktime_to_ms(max_time), ktime_to_ms(ws->last_time),
- ktime_to_ms(prevent_sleep_time));
+ seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
+ ws->name, active_count, ws->event_count,
+ ws->wakeup_count, ws->expire_count,
+ ktime_to_ms(active_time), ktime_to_ms(total_time),
+ ktime_to_ms(max_time), ktime_to_ms(ws->last_time),
+ ktime_to_ms(prevent_sleep_time));
spin_unlock_irqrestore(&ws->lock, flags);
- return ret;
+ return 0;
}
/**
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 2ce3dfd7e6b9..876d0c3eaf58 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -137,7 +137,7 @@
*/
-static bool verbose = 0;
+static int verbose;
static int major = PG_MAJOR;
static char *name = PG_NAME;
static int disable = 0;
@@ -168,7 +168,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
#include <asm/uaccess.h>
-module_param(verbose, bool, 0644);
+module_param(verbose, int, 0644);
module_param(major, int, 0);
module_param(name, charp, 0);
module_param_array(drive0, int, NULL, 0);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 871bd3550cb0..759ee093d7f8 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -32,22 +32,37 @@
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/sysfs.h>
#include "zram_drv.h"
-/* Globals */
+static DEFINE_IDR(zram_index_idr);
+/* idr index must be protected */
+static DEFINE_MUTEX(zram_index_mutex);
+
static int zram_major;
-static struct zram *zram_devices;
static const char *default_compressor = "lzo";
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
+static inline void deprecated_attr_warn(const char *name)
+{
+ pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n",
+ task_pid_nr(current),
+ current->comm,
+ name,
+ "See zram documentation.");
+}
+
#define ZRAM_ATTR_RO(name) \
-static ssize_t name##_show(struct device *d, \
+static ssize_t name##_show(struct device *d, \
struct device_attribute *attr, char *b) \
{ \
struct zram *zram = dev_to_zram(d); \
+ \
+ deprecated_attr_warn(__stringify(name)); \
return scnprintf(b, PAGE_SIZE, "%llu\n", \
(u64)atomic64_read(&zram->stats.name)); \
} \
@@ -63,6 +78,119 @@ static inline struct zram *dev_to_zram(struct device *dev)
return (struct zram *)dev_to_disk(dev)->private_data;
}
+/* flag operations require table entry bit_spin_lock() being held */
+static int zram_test_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ return meta->table[index].value & BIT(flag);
+}
+
+static void zram_set_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ meta->table[index].value |= BIT(flag);
+}
+
+static void zram_clear_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ meta->table[index].value &= ~BIT(flag);
+}
+
+static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+{
+ return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+}
+
+static void zram_set_obj_size(struct zram_meta *meta,
+ u32 index, size_t size)
+{
+ unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+
+ meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+}
+
+static inline int is_partial_io(struct bio_vec *bvec)
+{
+ return bvec->bv_len != PAGE_SIZE;
+}
+
+/*
+ * Check if request is within bounds and aligned on zram logical blocks.
+ */
+static inline int valid_io_request(struct zram *zram,
+ sector_t start, unsigned int size)
+{
+ u64 end, bound;
+
+ /* unaligned request */
+ if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+ return 0;
+ if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+ return 0;
+
+ end = start + (size >> SECTOR_SHIFT);
+ bound = zram->disksize >> SECTOR_SHIFT;
+ /* out of range range */
+ if (unlikely(start >= bound || end > bound || start > end))
+ return 0;
+
+ /* I/O request is valid */
+ return 1;
+}
+
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+ if (*offset + bvec->bv_len >= PAGE_SIZE)
+ (*index)++;
+ *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+}
+
+static inline void update_used_max(struct zram *zram,
+ const unsigned long pages)
+{
+ unsigned long old_max, cur_max;
+
+ old_max = atomic_long_read(&zram->stats.max_used_pages);
+
+ do {
+ cur_max = old_max;
+ if (pages > cur_max)
+ old_max = atomic_long_cmpxchg(
+ &zram->stats.max_used_pages, cur_max, pages);
+ } while (old_max != cur_max);
+}
+
+static int page_zero_filled(void *ptr)
+{
+ unsigned int pos;
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+
+ for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+ if (page[pos])
+ return 0;
+ }
+
+ return 1;
+}
+
+static void handle_zero_page(struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ void *user_mem;
+
+ user_mem = kmap_atomic(page);
+ if (is_partial_io(bvec))
+ memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+ else
+ clear_page(user_mem);
+ kunmap_atomic(user_mem);
+
+ flush_dcache_page(page);
+}
+
static ssize_t disksize_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -89,6 +217,7 @@ static ssize_t orig_data_size_show(struct device *dev,
{
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("orig_data_size");
return scnprintf(buf, PAGE_SIZE, "%llu\n",
(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
}
@@ -99,6 +228,7 @@ static ssize_t mem_used_total_show(struct device *dev,
u64 val = 0;
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("mem_used_total");
down_read(&zram->init_lock);
if (init_done(zram)) {
struct zram_meta *meta = zram->meta;
@@ -128,6 +258,7 @@ static ssize_t mem_limit_show(struct device *dev,
u64 val;
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("mem_limit");
down_read(&zram->init_lock);
val = zram->limit_pages;
up_read(&zram->init_lock);
@@ -159,6 +290,7 @@ static ssize_t mem_used_max_show(struct device *dev,
u64 val = 0;
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("mem_used_max");
down_read(&zram->init_lock);
if (init_done(zram))
val = atomic_long_read(&zram->stats.max_used_pages);
@@ -246,65 +378,95 @@ static ssize_t comp_algorithm_store(struct device *dev,
return len;
}
-/* flag operations needs meta->tb_lock */
-static int zram_test_flag(struct zram_meta *meta, u32 index,
- enum zram_pageflags flag)
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
{
- return meta->table[index].value & BIT(flag);
-}
+ unsigned long nr_migrated;
+ struct zram *zram = dev_to_zram(dev);
+ struct zram_meta *meta;
-static void zram_set_flag(struct zram_meta *meta, u32 index,
- enum zram_pageflags flag)
-{
- meta->table[index].value |= BIT(flag);
-}
+ down_read(&zram->init_lock);
+ if (!init_done(zram)) {
+ up_read(&zram->init_lock);
+ return -EINVAL;
+ }
-static void zram_clear_flag(struct zram_meta *meta, u32 index,
- enum zram_pageflags flag)
-{
- meta->table[index].value &= ~BIT(flag);
+ meta = zram->meta;
+ nr_migrated = zs_compact(meta->mem_pool);
+ atomic64_add(nr_migrated, &zram->stats.num_migrated);
+ up_read(&zram->init_lock);
+
+ return len;
}
-static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+static ssize_t io_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+ struct zram *zram = dev_to_zram(dev);
+ ssize_t ret;
+
+ down_read(&zram->init_lock);
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8llu\n",
+ (u64)atomic64_read(&zram->stats.failed_reads),
+ (u64)atomic64_read(&zram->stats.failed_writes),
+ (u64)atomic64_read(&zram->stats.invalid_io),
+ (u64)atomic64_read(&zram->stats.notify_free));
+ up_read(&zram->init_lock);
+
+ return ret;
}
-static void zram_set_obj_size(struct zram_meta *meta,
- u32 index, size_t size)
+static ssize_t mm_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+ struct zram *zram = dev_to_zram(dev);
+ u64 orig_size, mem_used = 0;
+ long max_used;
+ ssize_t ret;
- meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+ down_read(&zram->init_lock);
+ if (init_done(zram))
+ mem_used = zs_get_total_pages(zram->meta->mem_pool);
+
+ orig_size = atomic64_read(&zram->stats.pages_stored);
+ max_used = atomic_long_read(&zram->stats.max_used_pages);
+
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+ orig_size << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.compr_data_size),
+ mem_used << PAGE_SHIFT,
+ zram->limit_pages << PAGE_SHIFT,
+ max_used << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.zero_pages),
+ (u64)atomic64_read(&zram->stats.num_migrated));
+ up_read(&zram->init_lock);
+
+ return ret;
}
-static inline int is_partial_io(struct bio_vec *bvec)
+static DEVICE_ATTR_RO(io_stat);
+static DEVICE_ATTR_RO(mm_stat);
+ZRAM_ATTR_RO(num_reads);
+ZRAM_ATTR_RO(num_writes);
+ZRAM_ATTR_RO(failed_reads);
+ZRAM_ATTR_RO(failed_writes);
+ZRAM_ATTR_RO(invalid_io);
+ZRAM_ATTR_RO(notify_free);
+ZRAM_ATTR_RO(zero_pages);
+ZRAM_ATTR_RO(compr_data_size);
+
+static inline bool zram_meta_get(struct zram *zram)
{
- return bvec->bv_len != PAGE_SIZE;
+ if (atomic_inc_not_zero(&zram->refcount))
+ return true;
+ return false;
}
-/*
- * Check if request is within bounds and aligned on zram logical blocks.
- */
-static inline int valid_io_request(struct zram *zram,
- sector_t start, unsigned int size)
+static inline void zram_meta_put(struct zram *zram)
{
- u64 end, bound;
-
- /* unaligned request */
- if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
- return 0;
- if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
- return 0;
-
- end = start + (size >> SECTOR_SHIFT);
- bound = zram->disksize >> SECTOR_SHIFT;
- /* out of range range */
- if (unlikely(start >= bound || end > bound || start > end))
- return 0;
-
- /* I/O request is valid */
- return 1;
+ atomic_dec(&zram->refcount);
}
static void zram_meta_free(struct zram_meta *meta, u64 disksize)
@@ -358,56 +520,6 @@ out_error:
return NULL;
}
-static inline bool zram_meta_get(struct zram *zram)
-{
- if (atomic_inc_not_zero(&zram->refcount))
- return true;
- return false;
-}
-
-static inline void zram_meta_put(struct zram *zram)
-{
- atomic_dec(&zram->refcount);
-}
-
-static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
-{
- if (*offset + bvec->bv_len >= PAGE_SIZE)
- (*index)++;
- *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
-}
-
-static int page_zero_filled(void *ptr)
-{
- unsigned int pos;
- unsigned long *page;
-
- page = (unsigned long *)ptr;
-
- for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
- if (page[pos])
- return 0;
- }
-
- return 1;
-}
-
-static void handle_zero_page(struct bio_vec *bvec)
-{
- struct page *page = bvec->bv_page;
- void *user_mem;
-
- user_mem = kmap_atomic(page);
- if (is_partial_io(bvec))
- memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
- else
- clear_page(user_mem);
- kunmap_atomic(user_mem);
-
- flush_dcache_page(page);
-}
-
-
/*
* To protect concurrent access to the same index entry,
* caller should hold this table index entry's bit_spinlock to
@@ -440,6 +552,7 @@ static void zram_free_page(struct zram *zram, size_t index)
zram_set_obj_size(meta, index, 0);
}
+
static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
{
int ret = 0;
@@ -525,21 +638,6 @@ out_cleanup:
return ret;
}
-static inline void update_used_max(struct zram *zram,
- const unsigned long pages)
-{
- unsigned long old_max, cur_max;
-
- old_max = atomic_long_read(&zram->stats.max_used_pages);
-
- do {
- cur_max = old_max;
- if (pages > cur_max)
- old_max = atomic_long_cmpxchg(
- &zram->stats.max_used_pages, cur_max, pages);
- } while (old_max != cur_max);
-}
-
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
int offset)
{
@@ -667,29 +765,6 @@ out:
return ret;
}
-static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset, int rw)
-{
- int ret;
-
- if (rw == READ) {
- atomic64_inc(&zram->stats.num_reads);
- ret = zram_bvec_read(zram, bvec, index, offset);
- } else {
- atomic64_inc(&zram->stats.num_writes);
- ret = zram_bvec_write(zram, bvec, index, offset);
- }
-
- if (unlikely(ret)) {
- if (rw == READ)
- atomic64_inc(&zram->stats.failed_reads);
- else
- atomic64_inc(&zram->stats.failed_writes);
- }
-
- return ret;
-}
-
/*
* zram_bio_discard - handler on discard request
* @index: physical block index in PAGE_SIZE units
@@ -729,149 +804,32 @@ static void zram_bio_discard(struct zram *zram, u32 index,
}
}
-static void zram_reset_device(struct zram *zram)
-{
- struct zram_meta *meta;
- struct zcomp *comp;
- u64 disksize;
-
- down_write(&zram->init_lock);
-
- zram->limit_pages = 0;
-
- if (!init_done(zram)) {
- up_write(&zram->init_lock);
- return;
- }
-
- meta = zram->meta;
- comp = zram->comp;
- disksize = zram->disksize;
- /*
- * Refcount will go down to 0 eventually and r/w handler
- * cannot handle further I/O so it will bail out by
- * check zram_meta_get.
- */
- zram_meta_put(zram);
- /*
- * We want to free zram_meta in process context to avoid
- * deadlock between reclaim path and any other locks.
- */
- wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
-
- /* Reset stats */
- memset(&zram->stats, 0, sizeof(zram->stats));
- zram->disksize = 0;
- zram->max_comp_streams = 1;
- set_capacity(zram->disk, 0);
-
- up_write(&zram->init_lock);
- /* I/O operation under all of CPU are done so let's free */
- zram_meta_free(meta, disksize);
- zcomp_destroy(comp);
-}
-
-static ssize_t disksize_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
-{
- u64 disksize;
- struct zcomp *comp;
- struct zram_meta *meta;
- struct zram *zram = dev_to_zram(dev);
- int err;
-
- disksize = memparse(buf, NULL);
- if (!disksize)
- return -EINVAL;
-
- disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(zram->disk->first_minor, disksize);
- if (!meta)
- return -ENOMEM;
-
- comp = zcomp_create(zram->compressor, zram->max_comp_streams);
- if (IS_ERR(comp)) {
- pr_info("Cannot initialise %s compressing backend\n",
- zram->compressor);
- err = PTR_ERR(comp);
- goto out_free_meta;
- }
-
- down_write(&zram->init_lock);
- if (init_done(zram)) {
- pr_info("Cannot change disksize for initialized device\n");
- err = -EBUSY;
- goto out_destroy_comp;
- }
-
- init_waitqueue_head(&zram->io_done);
- atomic_set(&zram->refcount, 1);
- zram->meta = meta;
- zram->comp = comp;
- zram->disksize = disksize;
- set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
- up_write(&zram->init_lock);
-
- /*
- * Revalidate disk out of the init_lock to avoid lockdep splat.
- * It's okay because disk's capacity is protected by init_lock
- * so that revalidate_disk always sees up-to-date capacity.
- */
- revalidate_disk(zram->disk);
-
- return len;
-
-out_destroy_comp:
- up_write(&zram->init_lock);
- zcomp_destroy(comp);
-out_free_meta:
- zram_meta_free(meta, disksize);
- return err;
-}
-
-static ssize_t reset_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
+ int offset, int rw)
{
+ unsigned long start_time = jiffies;
int ret;
- unsigned short do_reset;
- struct zram *zram;
- struct block_device *bdev;
-
- zram = dev_to_zram(dev);
- bdev = bdget_disk(zram->disk, 0);
- if (!bdev)
- return -ENOMEM;
+ generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT,
+ &zram->disk->part0);
- mutex_lock(&bdev->bd_mutex);
- /* Do not reset an active device! */
- if (bdev->bd_openers) {
- ret = -EBUSY;
- goto out;
+ if (rw == READ) {
+ atomic64_inc(&zram->stats.num_reads);
+ ret = zram_bvec_read(zram, bvec, index, offset);
+ } else {
+ atomic64_inc(&zram->stats.num_writes);
+ ret = zram_bvec_write(zram, bvec, index, offset);
}
- ret = kstrtou16(buf, 10, &do_reset);
- if (ret)
- goto out;
+ generic_end_io_acct(rw, &zram->disk->part0, start_time);
- if (!do_reset) {
- ret = -EINVAL;
- goto out;
+ if (unlikely(ret)) {
+ if (rw == READ)
+ atomic64_inc(&zram->stats.failed_reads);
+ else
+ atomic64_inc(&zram->stats.failed_writes);
}
- /* Make sure all pending I/O is finished */
- fsync_bdev(bdev);
- zram_reset_device(zram);
-
- mutex_unlock(&bdev->bd_mutex);
- revalidate_disk(zram->disk);
- bdput(bdev);
-
- return len;
-
-out:
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdev);
return ret;
}
@@ -929,9 +887,7 @@ out:
bio_io_error(bio);
}
-/*
- * Handler function for all zram I/O requests.
- */
+/* Handler function for all zram I/O requests */
static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
struct zram *zram = queue->queuedata;
@@ -1011,12 +967,159 @@ out:
return err;
}
+/* internal device reset part -- cleanup allocated memory and
+ * return back to initial state */
+static void zram_reset_device_internal(struct zram *zram)
+{
+ struct zram_meta *meta;
+ struct zcomp *comp;
+ u64 disksize;
+
+ down_write(&zram->init_lock);
+
+ zram->limit_pages = 0;
+
+ if (!init_done(zram)) {
+ up_write(&zram->init_lock);
+ return;
+ }
+
+ meta = zram->meta;
+ comp = zram->comp;
+ disksize = zram->disksize;
+ /*
+ * Refcount will go down to 0 eventually and r/w handler
+ * cannot handle further I/O so it will bail out by
+ * check zram_meta_get.
+ */
+ zram_meta_put(zram);
+ /*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks.
+ */
+ wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
+
+ /* Reset stats */
+ memset(&zram->stats, 0, sizeof(zram->stats));
+ zram->disksize = 0;
+ zram->max_comp_streams = 1;
+ set_capacity(zram->disk, 0);
+
+ up_write(&zram->init_lock);
+ /* I/O operation under all of CPU are done so let's free */
+ zram_meta_free(meta, disksize);
+ zcomp_destroy(comp);
+}
+
+static int zram_reset_device(struct zram *zram)
+{
+ int ret = 0;
+ struct block_device *bdev = bdget_disk(zram->disk, 0);
+
+ if (!bdev)
+ return -ENOMEM;
+
+ mutex_lock(&bdev->bd_mutex);
+ /* Do not reset an active device! */
+ if (bdev->bd_openers) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* Make sure all pending I/O is finished */
+ fsync_bdev(bdev);
+ zram_reset_device_internal(zram);
+out:
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+ return ret;
+}
+
+static ssize_t reset_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int ret;
+ unsigned short do_reset;
+ struct zram *zram;
+
+ zram = dev_to_zram(dev);
+ ret = kstrtou16(buf, 10, &do_reset);
+ if (ret)
+ return ret;
+
+ if (!do_reset)
+ return -EINVAL;
+
+ ret = zram_reset_device(zram);
+ return ret ? ret : len;
+}
+
+static ssize_t disksize_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ u64 disksize;
+ struct zcomp *comp;
+ struct zram_meta *meta;
+ struct zram *zram = dev_to_zram(dev);
+ int err;
+
+ disksize = memparse(buf, NULL);
+ if (!disksize)
+ return -EINVAL;
+
+ disksize = PAGE_ALIGN(disksize);
+ meta = zram_meta_alloc(zram->disk->first_minor, disksize);
+ if (!meta)
+ return -ENOMEM;
+
+ comp = zcomp_create(zram->compressor, zram->max_comp_streams);
+ if (IS_ERR(comp)) {
+ pr_info("Cannot initialise %s compressing backend\n",
+ zram->compressor);
+ err = PTR_ERR(comp);
+ goto out_free_meta;
+ }
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ pr_info("Cannot change disksize for initialized device\n");
+ err = -EBUSY;
+ goto out_destroy_comp;
+ }
+
+ init_waitqueue_head(&zram->io_done);
+ atomic_set(&zram->refcount, 1);
+ zram->meta = meta;
+ zram->comp = comp;
+ zram->disksize = disksize;
+ set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+ up_write(&zram->init_lock);
+
+ /*
+ * Revalidate disk out of the init_lock to avoid lockdep splat.
+ * It's okay because disk's capacity is protected by init_lock
+ * so that revalidate_disk always sees up-to-date capacity.
+ */
+ revalidate_disk(zram->disk);
+
+ return len;
+
+out_destroy_comp:
+ up_write(&zram->init_lock);
+ zcomp_destroy(comp);
+out_free_meta:
+ zram_meta_free(meta, disksize);
+ return err;
+}
+
+/* per-device block device operations and sysfs attrs */
static const struct block_device_operations zram_devops = {
.swap_slot_free_notify = zram_slot_free_notify,
.rw_page = zram_rw_page,
.owner = THIS_MODULE
};
+static DEVICE_ATTR_WO(compact);
static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);
static DEVICE_ATTR_WO(reset);
@@ -1027,15 +1130,6 @@ static DEVICE_ATTR_RW(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
-ZRAM_ATTR_RO(num_reads);
-ZRAM_ATTR_RO(num_writes);
-ZRAM_ATTR_RO(failed_reads);
-ZRAM_ATTR_RO(failed_writes);
-ZRAM_ATTR_RO(invalid_io);
-ZRAM_ATTR_RO(notify_free);
-ZRAM_ATTR_RO(zero_pages);
-ZRAM_ATTR_RO(compr_data_size);
-
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
&dev_attr_initstate.attr,
@@ -1044,6 +1138,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_num_writes.attr,
&dev_attr_failed_reads.attr,
&dev_attr_failed_writes.attr,
+ &dev_attr_compact.attr,
&dev_attr_invalid_io.attr,
&dev_attr_notify_free.attr,
&dev_attr_zero_pages.attr,
@@ -1054,6 +1149,8 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_io_stat.attr,
+ &dev_attr_mm_stat.attr,
NULL,
};
@@ -1061,18 +1158,37 @@ static struct attribute_group zram_disk_attr_group = {
.attrs = zram_disk_attrs,
};
-static int create_device(struct zram *zram, int device_id)
+/* allocate and initialize new zram device. the function returns
+ * '>= 0' device_id upon success, and negative value otherwise. */
+static int zram_add(int device_id)
{
+ struct zram *zram;
struct request_queue *queue;
int ret = -ENOMEM;
+ zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
+ if (!zram)
+ return ret;
+
+ if (device_id < 0) {
+ /* generate new device_id */
+ ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
+ device_id = ret;
+ } else {
+ /* use provided device_id */
+ ret = idr_alloc(&zram_index_idr, zram, device_id,
+ device_id + 1, GFP_KERNEL);
+ }
+ if (ret < 0)
+ goto out_free_dev;
+
init_rwsem(&zram->init_lock);
queue = blk_alloc_queue(GFP_KERNEL);
if (!queue) {
pr_err("Error allocating disk queue for device %d\n",
device_id);
- goto out;
+ goto out_free_idr;
}
blk_queue_make_request(queue, zram_make_request);
@@ -1134,90 +1250,194 @@ static int create_device(struct zram *zram, int device_id)
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
zram->meta = NULL;
zram->max_comp_streams = 1;
- return 0;
+
+ pr_info("Added device: %s\n", zram->disk->disk_name);
+ return device_id;
out_free_disk:
del_gendisk(zram->disk);
put_disk(zram->disk);
out_free_queue:
blk_cleanup_queue(queue);
-out:
+out_free_idr:
+ idr_remove(&zram_index_idr, device_id);
+out_free_dev:
+ kfree(zram);
return ret;
}
-static void destroy_devices(unsigned int nr)
+static void zram_remove(struct zram *zram)
+{
+ /*
+ * Remove sysfs first, so no one will perform a disksize
+ * store while we destroy the devices
+ */
+ sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+ &zram_disk_attr_group);
+
+ zram_reset_device_internal(zram);
+ idr_remove(&zram_index_idr, zram->disk->first_minor);
+ blk_cleanup_queue(zram->disk->queue);
+
+ pr_info("Removed device: %s\n", zram->disk->disk_name);
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+ kfree(zram);
+}
+
+/* lookup if there is any device pointer that match the given device_id.
+ * return device pointer if so, or ERR_PTR() otherwise. */
+static struct zram *zram_lookup(int dev_id)
{
struct zram *zram;
- unsigned int i;
- for (i = 0; i < nr; i++) {
- zram = &zram_devices[i];
- /*
- * Remove sysfs first, so no one will perform a disksize
- * store while we destroy the devices
- */
- sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
- &zram_disk_attr_group);
+ zram = idr_find(&zram_index_idr, dev_id);
+ if (zram)
+ return zram;
+ return ERR_PTR(-ENODEV);
+}
+
+/* zram module control sysfs attributes */
+static ssize_t zram_add_show(struct class *class,
+ struct class_attribute *attr,
+ char *buf)
+{
+ int ret;
+
+ mutex_lock(&zram_index_mutex);
+ /* pick up available device_id */
+ ret = zram_add(-1);
+ mutex_unlock(&zram_index_mutex);
+
+ if (ret < 0)
+ return ret;
+ return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+}
+
+static ssize_t zram_remove_store(struct class *class,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct zram *zram;
+ int ret, err, dev_id;
+
+ mutex_lock(&zram_index_mutex);
+
+ /* dev_id is gendisk->first_minor, which is `int' */
+ ret = kstrtoint(buf, 10, &dev_id);
+ if (ret || dev_id < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
- zram_reset_device(zram);
+ zram = zram_lookup(dev_id);
+ if (IS_ERR(zram)) {
+ ret = PTR_ERR(zram);
+ goto out;
+ }
+
+ /*
+ * First, make ->disksize device attr RO, closing
+ * ZRAM_CTL_REMOVE vs disksize_store() race window
+ */
+ ret = sysfs_chmod_file(&disk_to_dev(zram->disk)->kobj,
+ &dev_attr_disksize.attr, S_IRUGO);
+ if (ret)
+ goto out;
- blk_cleanup_queue(zram->disk->queue);
- del_gendisk(zram->disk);
- put_disk(zram->disk);
+ ret = zram_reset_device(zram);
+ if (ret == 0) {
+ /* ->disksize is RO and there are no ->bd_openers */
+ zram_remove(zram);
+ goto out;
}
- kfree(zram_devices);
+ /*
+ * If there are still device bd_openers, try to make ->disksize
+ * RW again and return. even if we fail to make ->disksize RW,
+ * user still has RW ->reset attr. so it's possible to destroy
+ * that device.
+ */
+ err = sysfs_chmod_file(&disk_to_dev(zram->disk)->kobj,
+ &dev_attr_disksize.attr,
+ S_IWUSR | S_IRUGO);
+ if (err)
+ ret = err;
+
+out:
+ mutex_unlock(&zram_index_mutex);
+ return ret ? ret : count;
+}
+
+static struct class_attribute zram_control_class_attrs[] = {
+ __ATTR_RO(zram_add),
+ __ATTR_WO(zram_remove),
+ __ATTR_NULL,
+};
+
+static struct class zram_control_class = {
+ .name = "zram-control",
+ .owner = THIS_MODULE,
+ .class_attrs = zram_control_class_attrs,
+};
+
+static int zram_exit_cb(int id, void *ptr, void *data)
+{
+ zram_remove(ptr);
+ return 0;
+}
+
+static void destroy_devices(void)
+{
+ class_unregister(&zram_control_class);
+ idr_for_each(&zram_index_idr, &zram_exit_cb, NULL);
+ idr_destroy(&zram_index_idr);
unregister_blkdev(zram_major, "zram");
- pr_info("Destroyed %u device(s)\n", nr);
}
static int __init zram_init(void)
{
int ret, dev_id;
- if (num_devices > max_num_devices) {
- pr_warn("Invalid value for num_devices: %u\n",
- num_devices);
- return -EINVAL;
+ ret = class_register(&zram_control_class);
+ if (ret) {
+ pr_warn("Unable to register zram-control class\n");
+ return ret;
}
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
pr_warn("Unable to get major number\n");
+ class_unregister(&zram_control_class);
return -EBUSY;
}
- /* Allocate the device array and initialize each one */
- zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
- if (!zram_devices) {
- unregister_blkdev(zram_major, "zram");
- return -ENOMEM;
- }
-
for (dev_id = 0; dev_id < num_devices; dev_id++) {
- ret = create_device(&zram_devices[dev_id], dev_id);
- if (ret)
+ mutex_lock(&zram_index_mutex);
+ ret = zram_add(dev_id);
+ mutex_unlock(&zram_index_mutex);
+ if (ret < 0)
goto out_error;
}
- pr_info("Created %u device(s)\n", num_devices);
return 0;
out_error:
- destroy_devices(dev_id);
+ destroy_devices();
return ret;
}
static void __exit zram_exit(void)
{
- destroy_devices(num_devices);
+ destroy_devices();
}
module_init(zram_init);
module_exit(zram_exit);
module_param(num_devices, uint, 0);
-MODULE_PARM_DESC(num_devices, "Number of zram devices");
+MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 17056e589146..042994e807cf 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -20,12 +20,6 @@
#include "zcomp.h"
-/*
- * Some arbitrary value. This is just to catch
- * invalid value for num_devices module parameter.
- */
-static const unsigned max_num_devices = 32;
-
/*-- Configurable parameters */
/*
@@ -84,6 +78,7 @@ struct zram_stats {
atomic64_t compr_data_size; /* compressed size of pages stored */
atomic64_t num_reads; /* failed + successful */
atomic64_t num_writes; /* --do-- */
+ atomic64_t num_migrated; /* no. of migrated object */
atomic64_t failed_reads; /* can happen when memory is too low */
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
diff --git a/drivers/clk/bcm/clk-kona.c b/drivers/clk/bcm/clk-kona.c
index 05abae89262e..a0ef4f75d457 100644
--- a/drivers/clk/bcm/clk-kona.c
+++ b/drivers/clk/bcm/clk-kona.c
@@ -15,6 +15,7 @@
#include "clk-kona.h"
#include <linux/delay.h>
+#include <linux/kernel.h>
/*
* "Policies" affect the frequencies of bus clocks provided by a
@@ -51,21 +52,6 @@ static inline u32 bitfield_replace(u32 reg_val, u32 shift, u32 width, u32 val)
/* Divider and scaling helpers */
-/*
- * Implement DIV_ROUND_CLOSEST() for 64-bit dividend and both values
- * unsigned. Note that unlike do_div(), the remainder is discarded
- * and the return value is the quotient (not the remainder).
- */
-u64 do_div_round_closest(u64 dividend, unsigned long divisor)
-{
- u64 result;
-
- result = dividend + ((u64)divisor >> 1);
- (void)do_div(result, divisor);
-
- return result;
-}
-
/* Convert a divider into the scaled divisor value it represents. */
static inline u64 scaled_div_value(struct bcm_clk_div *div, u32 reg_div)
{
@@ -87,7 +73,7 @@ u64 scaled_div_build(struct bcm_clk_div *div, u32 div_value, u32 billionths)
combined = (u64)div_value * BILLION + billionths;
combined <<= div->u.s.frac_width;
- return do_div_round_closest(combined, BILLION);
+ return DIV_ROUND_CLOSEST_ULL(combined, BILLION);
}
/* The scaled minimum divisor representable by a divider */
@@ -731,7 +717,7 @@ static unsigned long clk_recalc_rate(struct ccu_data *ccu,
scaled_rate = scale_rate(pre_div, parent_rate);
scaled_rate = scale_rate(div, scaled_rate);
scaled_div = divider_read_scaled(ccu, pre_div);
- scaled_parent_rate = do_div_round_closest(scaled_rate,
+ scaled_parent_rate = DIV_ROUND_CLOSEST_ULL(scaled_rate,
scaled_div);
} else {
scaled_parent_rate = scale_rate(div, parent_rate);
@@ -743,7 +729,7 @@ static unsigned long clk_recalc_rate(struct ccu_data *ccu,
* rate.
*/
scaled_div = divider_read_scaled(ccu, div);
- result = do_div_round_closest(scaled_parent_rate, scaled_div);
+ result = DIV_ROUND_CLOSEST_ULL(scaled_parent_rate, scaled_div);
return (unsigned long)result;
}
@@ -790,7 +776,7 @@ static long round_rate(struct ccu_data *ccu, struct bcm_clk_div *div,
scaled_rate = scale_rate(pre_div, parent_rate);
scaled_rate = scale_rate(div, scaled_rate);
scaled_pre_div = divider_read_scaled(ccu, pre_div);
- scaled_parent_rate = do_div_round_closest(scaled_rate,
+ scaled_parent_rate = DIV_ROUND_CLOSEST_ULL(scaled_rate,
scaled_pre_div);
} else {
scaled_parent_rate = scale_rate(div, parent_rate);
@@ -802,7 +788,7 @@ static long round_rate(struct ccu_data *ccu, struct bcm_clk_div *div,
* the best we can do.
*/
if (!divider_is_fixed(div)) {
- best_scaled_div = do_div_round_closest(scaled_parent_rate,
+ best_scaled_div = DIV_ROUND_CLOSEST_ULL(scaled_parent_rate,
rate);
min_scaled_div = scaled_div_min(div);
max_scaled_div = scaled_div_max(div);
@@ -815,7 +801,7 @@ static long round_rate(struct ccu_data *ccu, struct bcm_clk_div *div,
}
/* OK, figure out the resulting rate */
- result = do_div_round_closest(scaled_parent_rate, best_scaled_div);
+ result = DIV_ROUND_CLOSEST_ULL(scaled_parent_rate, best_scaled_div);
if (scaled_div)
*scaled_div = best_scaled_div;
diff --git a/drivers/clk/bcm/clk-kona.h b/drivers/clk/bcm/clk-kona.h
index 2537b3072910..6849a64baf6d 100644
--- a/drivers/clk/bcm/clk-kona.h
+++ b/drivers/clk/bcm/clk-kona.h
@@ -503,7 +503,6 @@ extern struct clk_ops kona_peri_clk_ops;
/* Externally visible functions */
-extern u64 do_div_round_closest(u64 dividend, unsigned long divisor);
extern u64 scaled_div_max(struct bcm_clk_div *div);
extern u64 scaled_div_build(struct bcm_clk_div *div, u32 div_value,
u32 billionths);
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 40580794e23d..b8a5fa15ca24 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -190,12 +190,6 @@ static DEFINE_PER_CPU(struct menu_device, menu_devices);
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
-/* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */
-static u64 div_round64(u64 dividend, u32 divisor)
-{
- return div_u64(dividend + (divisor / 2), divisor);
-}
-
/*
* Try detecting repeating patterns by keeping track of the last 8
* intervals, and checking if the standard deviation of that set
@@ -317,7 +311,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
* operands are 32 bits.
* Make sure to round up for half microseconds.
*/
- data->predicted_us = div_round64((uint64_t)data->next_timer_us *
+ data->predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us *
data->correction_factor[data->bucket],
RESOLUTION * DECAY);
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index cb57b9c446f3..522eac291561 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -37,9 +37,6 @@
#include <drm/drm_rect.h>
#include <drm/drm_atomic.h>
-#define DIV_ROUND_CLOSEST_ULL(ll, d) \
-({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; })
-
/**
* _wait_for - magic (register) wait macro
*
diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c
index d8686ce89160..08532d4ffe0a 100644
--- a/drivers/gpu/drm/i915/intel_panel.c
+++ b/drivers/gpu/drm/i915/intel_panel.c
@@ -30,6 +30,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include "intel_drv.h"
diff --git a/drivers/hwmon/ina2xx.c b/drivers/hwmon/ina2xx.c
index d1542b7d4bc3..4d2815079fc2 100644
--- a/drivers/hwmon/ina2xx.c
+++ b/drivers/hwmon/ina2xx.c
@@ -36,6 +36,7 @@
#include <linux/jiffies.h>
#include <linux/of.h>
#include <linux/delay.h>
+#include <linux/util_macros.h>
#include <linux/platform_data/ina2xx.h>
@@ -141,19 +142,6 @@ static const struct ina2xx_config ina2xx_config[] = {
*/
static const int ina226_avg_tab[] = { 1, 4, 16, 64, 128, 256, 512, 1024 };
-static int ina226_avg_bits(int avg)
-{
- int i;
-
- /* Get the closest average from the tab. */
- for (i = 0; i < ARRAY_SIZE(ina226_avg_tab) - 1; i++) {
- if (avg <= (ina226_avg_tab[i] + ina226_avg_tab[i + 1]) / 2)
- break;
- }
-
- return i; /* Return 0b0111 for values greater than 1024. */
-}
-
static int ina226_reg_to_interval(u16 config)
{
int avg = ina226_avg_tab[INA226_READ_AVG(config)];
@@ -171,7 +159,8 @@ static u16 ina226_interval_to_reg(int interval, u16 config)
avg = DIV_ROUND_CLOSEST(interval * 1000,
INA226_TOTAL_CONV_TIME_DEFAULT);
- avg_bits = ina226_avg_bits(avg);
+ avg_bits = find_closest(avg, ina226_avg_tab,
+ ARRAY_SIZE(ina226_avg_tab));
return (config & ~INA226_AVG_RD_MASK) | INA226_SHIFT_AVG(avg_bits);
}
diff --git a/drivers/hwmon/lm85.c b/drivers/hwmon/lm85.c
index 2b4b419273fe..6ff773fcaefb 100644
--- a/drivers/hwmon/lm85.c
+++ b/drivers/hwmon/lm85.c
@@ -34,6 +34,7 @@
#include <linux/hwmon-sysfs.h>
#include <linux/err.h>
#include <linux/mutex.h>
+#include <linux/util_macros.h>
/* Addresses to scan */
static const unsigned short normal_i2c[] = { 0x2c, 0x2d, 0x2e, I2C_CLIENT_END };
@@ -190,15 +191,7 @@ static const int lm85_range_map[] = {
static int RANGE_TO_REG(long range)
{
- int i;
-
- /* Find the closest match */
- for (i = 0; i < 15; ++i) {
- if (range <= (lm85_range_map[i] + lm85_range_map[i + 1]) / 2)
- break;
- }
-
- return i;
+ return find_closest(range, lm85_range_map, ARRAY_SIZE(lm85_range_map));
}
#define RANGE_FROM_REG(val) lm85_range_map[(val) & 0x0f]
@@ -209,16 +202,12 @@ static const int lm85_freq_map[8] = { /* 1 Hz */
static const int adm1027_freq_map[8] = { /* 1 Hz */
11, 15, 22, 29, 35, 44, 59, 88
};
+#define FREQ_MAP_LEN 8
-static int FREQ_TO_REG(const int *map, unsigned long freq)
+static int FREQ_TO_REG(const int *map,
+ unsigned int map_size, unsigned long freq)
{
- int i;
-
- /* Find the closest match */
- for (i = 0; i < 7; ++i)
- if (freq <= (map[i] + map[i + 1]) / 2)
- break;
- return i;
+ return find_closest(freq, map, map_size);
}
static int FREQ_FROM_REG(const int *map, u8 reg)
@@ -828,7 +817,8 @@ static ssize_t set_pwm_freq(struct device *dev,
data->cfg5 &= ~ADT7468_HFPWM;
lm85_write_value(client, ADT7468_REG_CFG5, data->cfg5);
} else { /* Low freq. mode */
- data->pwm_freq[nr] = FREQ_TO_REG(data->freq_map, val);
+ data->pwm_freq[nr] = FREQ_TO_REG(data->freq_map,
+ FREQ_MAP_LEN, val);
lm85_write_value(client, LM85_REG_AFAN_RANGE(nr),
(data->zone[nr].range << 4)
| data->pwm_freq[nr]);
diff --git a/drivers/hwmon/w83795.c b/drivers/hwmon/w83795.c
index 21894131190f..49276bbdac3d 100644
--- a/drivers/hwmon/w83795.c
+++ b/drivers/hwmon/w83795.c
@@ -35,6 +35,7 @@
#include <linux/err.h>
#include <linux/mutex.h>
#include <linux/jiffies.h>
+#include <linux/util_macros.h>
/* Addresses to scan */
static const unsigned short normal_i2c[] = {
@@ -308,11 +309,8 @@ static u8 pwm_freq_to_reg(unsigned long val, u16 clkin)
unsigned long best0, best1;
/* Best fit for cksel = 0 */
- for (reg0 = 0; reg0 < ARRAY_SIZE(pwm_freq_cksel0) - 1; reg0++) {
- if (val > (pwm_freq_cksel0[reg0] +
- pwm_freq_cksel0[reg0 + 1]) / 2)
- break;
- }
+ reg0 = find_closest_descending(val, pwm_freq_cksel0,
+ ARRAY_SIZE(pwm_freq_cksel0));
if (val < 375) /* cksel = 1 can't beat this */
return reg0;
best0 = pwm_freq_cksel0[reg0];
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 6791fd16272c..3ef0cf9f5c44 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -73,7 +73,7 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
c4iw_init_wr_wait(&wr_wait);
wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16);
- skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+ skb = alloc_skb(wr_len, GFP_KERNEL);
if (!skb)
return -ENOMEM;
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
diff --git a/drivers/media/dvb-frontends/cxd2820r_c.c b/drivers/media/dvb-frontends/cxd2820r_c.c
index 149fdca3fb44..72b0e2db3aab 100644
--- a/drivers/media/dvb-frontends/cxd2820r_c.c
+++ b/drivers/media/dvb-frontends/cxd2820r_c.c
@@ -79,7 +79,7 @@ int cxd2820r_set_frontend_c(struct dvb_frontend *fe)
num = if_freq / 1000; /* Hz => kHz */
num *= 0x4000;
- if_ctl = 0x4000 - cxd2820r_div_u64_round_closest(num, 41000);
+ if_ctl = 0x4000 - DIV_ROUND_CLOSEST_ULL(num, 41000);
buf[0] = (if_ctl >> 8) & 0x3f;
buf[1] = (if_ctl >> 0) & 0xff;
diff --git a/drivers/media/dvb-frontends/cxd2820r_core.c b/drivers/media/dvb-frontends/cxd2820r_core.c
index 422e84bbb008..490e090048ef 100644
--- a/drivers/media/dvb-frontends/cxd2820r_core.c
+++ b/drivers/media/dvb-frontends/cxd2820r_core.c
@@ -244,12 +244,6 @@ error:
return ret;
}
-/* 64 bit div with round closest, like DIV_ROUND_CLOSEST but 64 bit */
-u32 cxd2820r_div_u64_round_closest(u64 dividend, u32 divisor)
-{
- return div_u64(dividend + (divisor / 2), divisor);
-}
-
static int cxd2820r_set_frontend(struct dvb_frontend *fe)
{
struct cxd2820r_priv *priv = fe->demodulator_priv;
diff --git a/drivers/media/dvb-frontends/cxd2820r_priv.h b/drivers/media/dvb-frontends/cxd2820r_priv.h
index 7ff5f60c83e1..4b428959b16e 100644
--- a/drivers/media/dvb-frontends/cxd2820r_priv.h
+++ b/drivers/media/dvb-frontends/cxd2820r_priv.h
@@ -64,8 +64,6 @@ int cxd2820r_wr_reg_mask(struct cxd2820r_priv *priv, u32 reg, u8 val,
int cxd2820r_wr_regs(struct cxd2820r_priv *priv, u32 reginfo, u8 *val,
int len);
-u32 cxd2820r_div_u64_round_closest(u64 dividend, u32 divisor);
-
int cxd2820r_wr_regs(struct cxd2820r_priv *priv, u32 reginfo, u8 *val,
int len);
diff --git a/drivers/media/dvb-frontends/cxd2820r_t.c b/drivers/media/dvb-frontends/cxd2820r_t.c
index 51401d036530..008cb2ac8480 100644
--- a/drivers/media/dvb-frontends/cxd2820r_t.c
+++ b/drivers/media/dvb-frontends/cxd2820r_t.c
@@ -103,7 +103,7 @@ int cxd2820r_set_frontend_t(struct dvb_frontend *fe)
num = if_freq / 1000; /* Hz => kHz */
num *= 0x1000000;
- if_ctl = cxd2820r_div_u64_round_closest(num, 41000);
+ if_ctl = DIV_ROUND_CLOSEST_ULL(num, 41000);
buf[0] = ((if_ctl >> 16) & 0xff);
buf[1] = ((if_ctl >> 8) & 0xff);
buf[2] = ((if_ctl >> 0) & 0xff);
diff --git a/drivers/media/dvb-frontends/cxd2820r_t2.c b/drivers/media/dvb-frontends/cxd2820r_t2.c
index 9c0c4f42175c..35fe364c7182 100644
--- a/drivers/media/dvb-frontends/cxd2820r_t2.c
+++ b/drivers/media/dvb-frontends/cxd2820r_t2.c
@@ -120,7 +120,7 @@ int cxd2820r_set_frontend_t2(struct dvb_frontend *fe)
num = if_freq / 1000; /* Hz => kHz */
num *= 0x1000000;
- if_ctl = cxd2820r_div_u64_round_closest(num, 41000);
+ if_ctl = DIV_ROUND_CLOSEST_ULL(num, 41000);
buf[0] = ((if_ctl >> 16) & 0xff);
buf[1] = ((if_ctl >> 8) & 0xff);
buf[2] = ((if_ctl >> 0) & 0xff);
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index fc145d202c46..922a750640e8 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -758,7 +758,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
if (error || (card->current_mrq.tpc == MSPRO_CMD_STOP)) {
if (msb->data_dir == READ) {
- for (cnt = 0; cnt < msb->current_seg; cnt++)
+ for (cnt = 0; cnt < msb->current_seg; cnt++) {
t_len += msb->req_sg[cnt].length
/ msb->page_size;
@@ -766,6 +766,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
t_len += msb->current_page - 1;
t_len *= msb->page_size;
+ }
}
} else
t_len = blk_rq_bytes(msb->block_req);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index e40e283ff36c..5ca9c9ed633c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1119,6 +1119,10 @@ static int set_filter_wr(struct adapter *adapter, int fidx)
struct fw_filter_wr *fwr;
unsigned int ftid;
+ skb = alloc_skb(sizeof(*fwr), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
/* If the new filter requires loopback Destination MAC and/or VLAN
* rewriting then we need to allocate a Layer 2 Table (L2T) entry for
* the filter.
@@ -1126,19 +1130,21 @@ static int set_filter_wr(struct adapter *adapter, int fidx)
if (f->fs.newdmac || f->fs.newvlan) {
/* allocate L2T entry for new filter */
f->l2t = t4_l2t_alloc_switching(adapter->l2t);
- if (f->l2t == NULL)
+ if (f->l2t == NULL) {
+ kfree_skb(skb);
return -EAGAIN;
+ }
if (t4_l2t_set_switching(adapter, f->l2t, f->fs.vlan,
f->fs.eport, f->fs.dmac)) {
cxgb4_l2t_release(f->l2t);
f->l2t = NULL;
+ kfree_skb(skb);
return -ENOMEM;
}
}
ftid = adapter->tids.ftid_base + fidx;
- skb = alloc_skb(sizeof(*fwr), GFP_KERNEL | __GFP_NOFAIL);
fwr = (struct fw_filter_wr *)__skb_put(skb, sizeof(*fwr));
memset(fwr, 0, sizeof(*fwr));
@@ -1236,7 +1242,10 @@ static int del_filter_wr(struct adapter *adapter, int fidx)
len = sizeof(*fwr);
ftid = adapter->tids.ftid_base + fidx;
- skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL);
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
fwr = (struct fw_filter_wr *)__skb_put(skb, len);
t4_mk_filtdelwr(ftid, fwr, adapter->sge.fw_evtq.abs_id);
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 8b490d77054f..6bc16809c504 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1021,7 +1021,6 @@ static struct hppa_dma_ops ccio_ops = {
#ifdef CONFIG_PROC_FS
static int ccio_proc_info(struct seq_file *m, void *p)
{
- int len = 0;
struct ioc *ioc = ioc_list;
while (ioc != NULL) {
@@ -1031,22 +1030,22 @@ static int ccio_proc_info(struct seq_file *m, void *p)
int j;
#endif
- len += seq_printf(m, "%s\n", ioc->name);
+ seq_printf(m, "%s\n", ioc->name);
- len += seq_printf(m, "Cujo 2.0 bug : %s\n",
- (ioc->cujo20_bug ? "yes" : "no"));
+ seq_printf(m, "Cujo 2.0 bug : %s\n",
+ (ioc->cujo20_bug ? "yes" : "no"));
- len += seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n",
- total_pages * 8, total_pages);
+ seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n",
+ total_pages * 8, total_pages);
#ifdef CCIO_COLLECT_STATS
- len += seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n",
- total_pages - ioc->used_pages, ioc->used_pages,
- (int)(ioc->used_pages * 100 / total_pages));
+ seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n",
+ total_pages - ioc->used_pages, ioc->used_pages,
+ (int)(ioc->used_pages * 100 / total_pages));
#endif
- len += seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n",
- ioc->res_size, total_pages);
+ seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n",
+ ioc->res_size, total_pages);
#ifdef CCIO_COLLECT_STATS
min = max = ioc->avg_search[0];
@@ -1058,26 +1057,26 @@ static int ccio_proc_info(struct seq_file *m, void *p)
min = ioc->avg_search[j];
}
avg /= CCIO_SEARCH_SAMPLE;
- len += seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n",
- min, avg, max);
+ seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n",
+ min, avg, max);
- len += seq_printf(m, "pci_map_single(): %8ld calls %8ld pages (avg %d/1000)\n",
- ioc->msingle_calls, ioc->msingle_pages,
- (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls));
+ seq_printf(m, "pci_map_single(): %8ld calls %8ld pages (avg %d/1000)\n",
+ ioc->msingle_calls, ioc->msingle_pages,
+ (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls));
/* KLUGE - unmap_sg calls unmap_single for each mapped page */
min = ioc->usingle_calls - ioc->usg_calls;
max = ioc->usingle_pages - ioc->usg_pages;
- len += seq_printf(m, "pci_unmap_single: %8ld calls %8ld pages (avg %d/1000)\n",
- min, max, (int)((max * 1000)/min));
+ seq_printf(m, "pci_unmap_single: %8ld calls %8ld pages (avg %d/1000)\n",
+ min, max, (int)((max * 1000)/min));
- len += seq_printf(m, "pci_map_sg() : %8ld calls %8ld pages (avg %d/1000)\n",
- ioc->msg_calls, ioc->msg_pages,
- (int)((ioc->msg_pages * 1000)/ioc->msg_calls));
+ seq_printf(m, "pci_map_sg() : %8ld calls %8ld pages (avg %d/1000)\n",
+ ioc->msg_calls, ioc->msg_pages,
+ (int)((ioc->msg_pages * 1000)/ioc->msg_calls));
- len += seq_printf(m, "pci_unmap_sg() : %8ld calls %8ld pages (avg %d/1000)\n\n\n",
- ioc->usg_calls, ioc->usg_pages,
- (int)((ioc->usg_pages * 1000)/ioc->usg_calls));
+ seq_printf(m, "pci_unmap_sg() : %8ld calls %8ld pages (avg %d/1000)\n\n\n",
+ ioc->usg_calls, ioc->usg_pages,
+ (int)((ioc->usg_pages * 1000)/ioc->usg_calls));
#endif /* CCIO_COLLECT_STATS */
ioc = ioc->next;
@@ -1101,7 +1100,6 @@ static const struct file_operations ccio_proc_info_fops = {
static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
{
- int len = 0;
struct ioc *ioc = ioc_list;
while (ioc != NULL) {
@@ -1110,11 +1108,11 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) {
if ((j & 7) == 0)
- len += seq_puts(m, "\n ");
- len += seq_printf(m, "%08x", *res_ptr);
+ seq_puts(m, "\n ");
+ seq_printf(m, "%08x", *res_ptr);
res_ptr++;
}
- len += seq_puts(m, "\n\n");
+ seq_puts(m, "\n\n");
ioc = ioc->next;
break; /* XXX - remove me */
}
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 1ff1b67e8b27..f07471264689 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1774,37 +1774,35 @@ static int sba_proc_info(struct seq_file *m, void *p)
#ifdef SBA_COLLECT_STATS
unsigned long avg = 0, min, max;
#endif
- int i, len = 0;
-
- len += seq_printf(m, "%s rev %d.%d\n",
- sba_dev->name,
- (sba_dev->hw_rev & 0x7) + 1,
- (sba_dev->hw_rev & 0x18) >> 3
- );
- len += seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n",
- (int) ((ioc->res_size << 3) * sizeof(u64)), /* 8 bits/byte */
- total_pages);
-
- len += seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n",
- ioc->res_size, ioc->res_size << 3); /* 8 bits per byte */
-
- len += seq_printf(m, "LMMIO_BASE/MASK/ROUTE %08x %08x %08x\n",
- READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_BASE),
- READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_MASK),
- READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_ROUTE)
- );
+ int i;
+
+ seq_printf(m, "%s rev %d.%d\n",
+ sba_dev->name,
+ (sba_dev->hw_rev & 0x7) + 1,
+ (sba_dev->hw_rev & 0x18) >> 3);
+ seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n",
+ (int)((ioc->res_size << 3) * sizeof(u64)), /* 8 bits/byte */
+ total_pages);
+
+ seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n",
+ ioc->res_size, ioc->res_size << 3); /* 8 bits per byte */
+
+ seq_printf(m, "LMMIO_BASE/MASK/ROUTE %08x %08x %08x\n",
+ READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_BASE),
+ READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_MASK),
+ READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_ROUTE));
for (i=0; i<4; i++)
- len += seq_printf(m, "DIR%d_BASE/MASK/ROUTE %08x %08x %08x\n", i,
- READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_BASE + i*0x18),
- READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_MASK + i*0x18),
- READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_ROUTE + i*0x18)
- );
+ seq_printf(m, "DIR%d_BASE/MASK/ROUTE %08x %08x %08x\n",
+ i,
+ READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_BASE + i*0x18),
+ READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_MASK + i*0x18),
+ READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_ROUTE + i*0x18));
#ifdef SBA_COLLECT_STATS
- len += seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n",
- total_pages - ioc->used_pages, ioc->used_pages,
- (int) (ioc->used_pages * 100 / total_pages));
+ seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n",
+ total_pages - ioc->used_pages, ioc->used_pages,
+ (int)(ioc->used_pages * 100 / total_pages));
min = max = ioc->avg_search[0];
for (i = 0; i < SBA_SEARCH_SAMPLE; i++) {
@@ -1813,26 +1811,26 @@ static int sba_proc_info(struct seq_file *m, void *p)
if (ioc->avg_search[i] < min) min = ioc->avg_search[i];
}
avg /= SBA_SEARCH_SAMPLE;
- len += seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n",
- min, avg, max);
+ seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n",
+ min, avg, max);
- len += seq_printf(m, "pci_map_single(): %12ld calls %12ld pages (avg %d/1000)\n",
- ioc->msingle_calls, ioc->msingle_pages,
- (int) ((ioc->msingle_pages * 1000)/ioc->msingle_calls));
+ seq_printf(m, "pci_map_single(): %12ld calls %12ld pages (avg %d/1000)\n",
+ ioc->msingle_calls, ioc->msingle_pages,
+ (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls));
/* KLUGE - unmap_sg calls unmap_single for each mapped page */
min = ioc->usingle_calls;
max = ioc->usingle_pages - ioc->usg_pages;
- len += seq_printf(m, "pci_unmap_single: %12ld calls %12ld pages (avg %d/1000)\n",
- min, max, (int) ((max * 1000)/min));
+ seq_printf(m, "pci_unmap_single: %12ld calls %12ld pages (avg %d/1000)\n",
+ min, max, (int)((max * 1000)/min));
- len += seq_printf(m, "pci_map_sg() : %12ld calls %12ld pages (avg %d/1000)\n",
- ioc->msg_calls, ioc->msg_pages,
- (int) ((ioc->msg_pages * 1000)/ioc->msg_calls));
+ seq_printf(m, "pci_map_sg() : %12ld calls %12ld pages (avg %d/1000)\n",
+ ioc->msg_calls, ioc->msg_pages,
+ (int)((ioc->msg_pages * 1000)/ioc->msg_calls));
- len += seq_printf(m, "pci_unmap_sg() : %12ld calls %12ld pages (avg %d/1000)\n",
- ioc->usg_calls, ioc->usg_pages,
- (int) ((ioc->usg_pages * 1000)/ioc->usg_calls));
+ seq_printf(m, "pci_unmap_sg() : %12ld calls %12ld pages (avg %d/1000)\n",
+ ioc->usg_calls, ioc->usg_pages,
+ (int)((ioc->usg_pages * 1000)/ioc->usg_calls));
#endif
return 0;
@@ -1858,14 +1856,14 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
struct sba_device *sba_dev = sba_list;
struct ioc *ioc = &sba_dev->ioc[0]; /* FIXME: Multi-IOC support! */
unsigned int *res_ptr = (unsigned int *)ioc->res_map;
- int i, len = 0;
+ int i;
for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) {
if ((i & 7) == 0)
- len += seq_printf(m, "\n ");
- len += seq_printf(m, " %08x", *res_ptr);
+ seq_puts(m, "\n ");
+ seq_printf(m, " %08x", *res_ptr);
}
- len += seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index b5b5c3d485d6..5c0c698a5abf 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -164,6 +164,15 @@ config RTC_DRV_ABB5ZES3
This driver can also be built as a module. If so, the module
will be called rtc-ab-b5ze-s3.
+config RTC_DRV_ABX80X
+ tristate "Abracon ABx80x"
+ help
+ If you say yes here you get support for Abracon AB080x and AB180x
+ real-time clock chips.
+
+ This driver can also be built as a module. If so, the module
+ will be called rtc-abx80x.
+
config RTC_DRV_AS3722
tristate "ams AS3722 RTC driver"
depends on MFD_AS3722
@@ -1081,6 +1090,14 @@ config RTC_DRV_AB8500
Select this to enable the ST-Ericsson AB8500 power management IC RTC
support. This chip contains a battery- and capacitor-backed RTC.
+config RTC_DRV_ABX805
+ tristate "Abracon AB X805 RTC"
+ depends on I2C
+ help
+ Select this to enable support for the Abracon AB X805 RTC.
+ AB X805 is the i2c flavour of the AB 18X5 family of ultra-low-power
+ battery- and capacitor-backed RTC..
+
config RTC_DRV_NUC900
tristate "NUC910/NUC920 RTC driver"
depends on ARCH_W90X900
@@ -1111,6 +1128,16 @@ config RTC_DRV_DAVINCI
This driver can also be built as a module. If so, the module
will be called rtc-davinci.
+config RTC_DRV_DIGICOLOR
+ tristate "Conexant Digicolor RTC"
+ depends on ARCH_DIGICOLOR
+ help
+ If you say yes here you get support for the RTC on Conexant
+ Digicolor platforms. This currently includes the CX92755 SoC.
+
+ This driver can also be built as a module. If so, the module
+ will be called rtc-digicolor.
+
config RTC_DRV_IMXDI
tristate "Freescale IMX DryIce Real Time Clock"
depends on ARCH_MXC
@@ -1510,6 +1537,16 @@ config RTC_DRV_MOXART
This driver can also be built as a module. If so, the module
will be called rtc-moxart
+config RTC_DRV_MT63XX
+ tristate "Mediatek Real Time Clock driver"
+ depends on MFD_MT6397
+ help
+ This selects the Mediatek(R) RTC driver, you should add support
+ for Mediatek MT6397 PMIC before select Mediatek(R) RTC driver.
+
+ If you want to use Mediatek(R) RTC interface, select Y or M here.
+ If unsure, Please select N.
+
config RTC_DRV_XGENE
tristate "APM X-Gene RTC"
depends on HAS_IOMEM
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 69c87062b098..6b3ea2eaddcc 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -25,6 +25,8 @@ obj-$(CONFIG_RTC_DRV_88PM80X) += rtc-88pm80x.o
obj-$(CONFIG_RTC_DRV_AB3100) += rtc-ab3100.o
obj-$(CONFIG_RTC_DRV_AB8500) += rtc-ab8500.o
obj-$(CONFIG_RTC_DRV_ABB5ZES3) += rtc-ab-b5ze-s3.o
+obj-$(CONFIG_RTC_DRV_ABX805) += rtc-abx805.o
+obj-$(CONFIG_RTC_DRV_ABX80X) += rtc-abx80x.o
obj-$(CONFIG_RTC_DRV_ARMADA38X) += rtc-armada38x.o
obj-$(CONFIG_RTC_DRV_AS3722) += rtc-as3722.o
obj-$(CONFIG_RTC_DRV_AT32AP700X)+= rtc-at32ap700x.o
@@ -40,6 +42,7 @@ obj-$(CONFIG_RTC_DRV_DA9052) += rtc-da9052.o
obj-$(CONFIG_RTC_DRV_DA9055) += rtc-da9055.o
obj-$(CONFIG_RTC_DRV_DA9063) += rtc-da9063.o
obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o
+obj-$(CONFIG_RTC_DRV_DIGICOLOR) += rtc-digicolor.o
obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o
obj-$(CONFIG_RTC_DRV_VRTC) += rtc-mrst.o
obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o
@@ -153,3 +156,4 @@ obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o
obj-$(CONFIG_RTC_DRV_XGENE) += rtc-xgene.o
obj-$(CONFIG_RTC_DRV_SIRFSOC) += rtc-sirfsoc.o
obj-$(CONFIG_RTC_DRV_MOXART) += rtc-moxart.o
+obj-$(CONFIG_RTC_DRV_MT63XX) += rtc-mt6397.o
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 472a5adc4642..c7e09e27ae5d 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -55,6 +55,8 @@ static int rtc_suspend(struct device *dev)
struct timespec64 delta, delta_delta;
int err;
+ rtc->valid_alarm = !rtc_read_alarm(rtc, &rtc->alarm);
+
if (has_persistent_clock())
return 0;
@@ -102,6 +104,27 @@ static int rtc_resume(struct device *dev)
struct timespec64 sleep_time;
int err;
+ /*
+ * Ensure that the platform hasn't overwritten a pending alarm while
+ * suspended
+ */
+ if (rtc->valid_alarm) {
+ long now, scheduled;
+
+ rtc_read_time(rtc, &tm);
+ rtc_tm_to_time(&rtc->alarm.time, &scheduled);
+ rtc_tm_to_time(&tm, &now);
+
+ /* Clear the alarm registers if it went off during suspend */
+ if (scheduled <= now) {
+ rtc_time_to_tm(0, &rtc->alarm.time);
+ rtc->alarm.enabled = 0;
+ }
+
+ if (rtc->ops && rtc->ops->set_alarm)
+ rtc->ops->set_alarm(rtc->dev.parent, &rtc->alarm);
+ }
+
if (has_persistent_clock())
return 0;
@@ -145,6 +168,7 @@ static int rtc_resume(struct device *dev)
if (sleep_time.tv_sec >= 0)
timekeeping_inject_sleeptime64(&sleep_time);
rtc_hctosys_ret = 0;
+
return 0;
}
diff --git a/drivers/rtc/hctosys.c b/drivers/rtc/hctosys.c
index 6c719f23520a..fb4251de7ce1 100644
--- a/drivers/rtc/hctosys.c
+++ b/drivers/rtc/hctosys.c
@@ -32,7 +32,7 @@ static int __init rtc_hctosys(void)
struct rtc_device *rtc = rtc_class_open(CONFIG_RTC_HCTOSYS_DEVICE);
if (rtc == NULL) {
- pr_err("%s: unable to open rtc device (%s)\n",
+ pr_info("%s: unable to open rtc device (%s)\n",
__FILE__, CONFIG_RTC_HCTOSYS_DEVICE);
goto err_open;
}
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 37215cf983e9..75efdaf81e04 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -487,7 +487,10 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled)
struct rtc_time tm;
ktime_t now, onesec;
- __rtc_read_time(rtc, &tm);
+ err = __rtc_read_time(rtc, &tm);
+ if (err < 0)
+ goto out;
+
onesec = ktime_set(1, 0);
now = rtc_tm_to_ktime(tm);
rtc->uie_rtctimer.node.expires = ktime_add(now, onesec);
@@ -865,13 +868,17 @@ void rtc_timer_do_work(struct work_struct *work)
struct timerqueue_node *next;
ktime_t now;
struct rtc_time tm;
+ int err = 0;
struct rtc_device *rtc =
container_of(work, struct rtc_device, irqwork);
mutex_lock(&rtc->ops_lock);
again:
- __rtc_read_time(rtc, &tm);
+ err = __rtc_read_time(rtc, &tm);
+ if (err < 0)
+ goto out;
+
now = rtc_tm_to_ktime(tm);
while ((next = timerqueue_getnext(&rtc->timerqueue))) {
if (next->expires.tv64 > now.tv64)
@@ -896,7 +903,6 @@ again:
/* Set next alarm */
if (next) {
struct rtc_wkalrm alarm;
- int err;
int retry = 3;
alarm.time = rtc_ktime_to_tm(next->expires);
@@ -918,6 +924,7 @@ reprogram:
} else
rtc_alarm_disable(rtc);
+out:
pm_relax(rtc->dev.parent);
mutex_unlock(&rtc->ops_lock);
}
diff --git a/drivers/rtc/rtc-ab-b5ze-s3.c b/drivers/rtc/rtc-ab-b5ze-s3.c
index cfc2ef98d393..b5cbc1bf5a3e 100644
--- a/drivers/rtc/rtc-ab-b5ze-s3.c
+++ b/drivers/rtc/rtc-ab-b5ze-s3.c
@@ -881,7 +881,7 @@ static const struct rtc_class_ops rtc_ops = {
.alarm_irq_enable = abb5zes3_rtc_alarm_irq_enable,
};
-static struct regmap_config abb5zes3_rtc_regmap_config = {
+static const struct regmap_config abb5zes3_rtc_regmap_config = {
.reg_bits = 8,
.val_bits = 8,
};
diff --git a/drivers/rtc/rtc-abx805.c b/drivers/rtc/rtc-abx805.c
new file mode 100644
index 000000000000..cafcfbab0e4b
--- /dev/null
+++ b/drivers/rtc/rtc-abx805.c
@@ -0,0 +1,223 @@
+/*
+ * A driver for the I2C members of the Abracon AB 18X5 RTC family,
+ * and compatible: AB 1805 and AB 0805
+ *
+ * Copyright 2014-2015 Macq S.A.
+ *
+ * Author: Philippe De Muyter <phdm@macqel.be>
+ *
+ * Based on rtc-em3027.c by Mike Rapoport <mike@compulab.co.il>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/i2c.h>
+#include <linux/rtc.h>
+#include <linux/bcd.h>
+#include <linux/module.h>
+
+/* Registers */
+
+#define ABX805_REG_SECONDS 0x01
+#define ABX805_REG_CONFIGURATION_KEY 0x1f
+#define KEY_ENABLE_MISC_REGISTERS_WRITE_ACCESS 0x90
+#define ABX805_REG_TRICKLE 0x20
+#define TRICKLE_CHARGE_ENABLE 0xA0
+#define TRICKLE_STANDARD_DIODE 0x8
+#define TRICKLE_SCHOTTKY_DIODE 0x4
+#define TRICKLE_OUTPUT_RESISTOR_3KOHM 0x1
+#define TRICKLE_OUTPUT_RESISTOR_6KOHM 0x2
+#define TRICKLE_OUTPUT_RESISTOR_11KOHM 0x3
+#define ABX805_REG_ID0 0x28
+
+static struct i2c_driver abx805_driver;
+
+static int abx805_read_multiple_regs(struct i2c_client *client,
+ u8 *buf, u8 addr0, int len)
+{
+ u8 addr = addr0;
+ struct i2c_msg msgs[] = {
+ {/* setup read addr */
+ .addr = client->addr,
+ .len = 1,
+ .buf = &addr
+ },
+ {/* read into buf */
+ .addr = client->addr,
+ .flags = I2C_M_RD,
+ .len = len,
+ .buf = buf
+ },
+ };
+
+ if ((i2c_transfer(client->adapter, &msgs[0], 2)) != 2) {
+ dev_err(&client->dev, "%s: read error\n", __func__);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int abx805_enable_trickle_charger(struct i2c_client *client)
+{
+ u8 buf[2];
+ struct i2c_msg msg = {
+ .addr = client->addr,
+ .len = 2,
+ .buf = buf,
+ };
+
+ /*
+ * Write 0x90 in the configuration key register (0x1F) to enable
+ * the access to the Trickle register
+ */
+ buf[0] = ABX805_REG_CONFIGURATION_KEY;
+ buf[1] = 0x9D;
+
+ /* write register */
+ if ((i2c_transfer(client->adapter, &msg, 1)) != 1) {
+ dev_err(&client->dev, "%s: write error\n", __func__);
+ return -EIO;
+ }
+
+ buf[0] = ABX805_REG_TRICKLE;
+ buf[1] = TRICKLE_CHARGE_ENABLE | TRICKLE_SCHOTTKY_DIODE |
+ TRICKLE_OUTPUT_RESISTOR_3KOHM;
+
+ /* write register */
+ if ((i2c_transfer(client->adapter, &msg, 1)) != 1) {
+ dev_err(&client->dev, "%s: write error\n", __func__);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int abx805_get_time(struct device *dev, struct rtc_time *tm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ u8 buf[7];
+ int err;
+
+ dev_dbg(dev, "abx805_get_time\n");
+ /* read time/date registers */
+ err = abx805_read_multiple_regs(client, buf, ABX805_REG_SECONDS,
+ sizeof(buf));
+ if (err) {
+ dev_err(&client->dev, "%s: read error\n", __func__);
+ return err;
+ }
+
+ tm->tm_sec = bcd2bin(buf[0]);
+ tm->tm_min = bcd2bin(buf[1]);
+ tm->tm_hour = bcd2bin(buf[2]);
+ tm->tm_mday = bcd2bin(buf[3]);
+ tm->tm_mon = bcd2bin(buf[4]) - 1;
+ tm->tm_year = bcd2bin(buf[5]) + 100;
+ tm->tm_wday = bcd2bin(buf[6]);
+
+ return 0;
+}
+
+static int abx805_set_time(struct device *dev, struct rtc_time *tm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ u8 buf[8];
+
+ struct i2c_msg msg = {
+ .addr = client->addr,
+ .len = 8,
+ .buf = buf, /* write time/date */
+ };
+
+ dev_dbg(dev, "abx805_set_time\n");
+ buf[0] = ABX805_REG_SECONDS;
+ buf[1] = bin2bcd(tm->tm_sec);
+ buf[2] = bin2bcd(tm->tm_min);
+ buf[3] = bin2bcd(tm->tm_hour);
+ buf[4] = bin2bcd(tm->tm_mday);
+ buf[5] = bin2bcd(tm->tm_mon + 1);
+ buf[6] = bin2bcd(tm->tm_year % 100);
+ buf[7] = bin2bcd(tm->tm_wday);
+
+ /* write time/date registers */
+ if ((i2c_transfer(client->adapter, &msg, 1)) != 1) {
+ dev_err(&client->dev, "%s: write error\n", __func__);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static const struct rtc_class_ops abx805_rtc_ops = {
+ .read_time = abx805_get_time,
+ .set_time = abx805_set_time,
+};
+
+static int abx805_probe(struct i2c_client *client,
+ const struct i2c_device_id *id)
+{
+ int err;
+ struct rtc_device *rtc;
+ char buf[7];
+ unsigned int partnumber;
+ unsigned int majrev, minrev;
+ unsigned int lot;
+ unsigned int wafer;
+ unsigned int uid;
+
+ dev_info(&client->dev, "abx805_probe\n");
+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C))
+ return -ENODEV;
+
+ err = abx805_read_multiple_regs(client, buf, ABX805_REG_ID0,
+ sizeof(buf));
+ if (err)
+ return err;
+
+ partnumber = (buf[0] << 8) | buf[1];
+ majrev = buf[2] >> 3;
+ minrev = buf[2] & 0x7;
+ lot = ((buf[4] & 0x80) << 2) | ((buf[6] & 0x80) << 1) | buf[3];
+ uid = ((buf[4] & 0x7f) << 8) | buf[5];
+ wafer = (buf[6] & 0x7c) >> 2;
+ dev_info(&client->dev, "model %04x, revision %u.%u, lot %x, wafer %x, uid %x\n",
+ partnumber, majrev, minrev, lot, wafer, uid);
+
+ abx805_enable_trickle_charger(client);
+
+ rtc = devm_rtc_device_register(&client->dev, abx805_driver.driver.name,
+ &abx805_rtc_ops, THIS_MODULE);
+ if (IS_ERR(rtc))
+ return PTR_ERR(rtc);
+
+ i2c_set_clientdata(client, rtc);
+
+ return 0;
+}
+
+static int abx805_remove(struct i2c_client *client)
+{
+ return 0;
+}
+
+static struct i2c_device_id abx805_id[] = {
+ { "abx805-rtc", 0 },
+ { }
+};
+
+static struct i2c_driver abx805_driver = {
+ .driver = {
+ .name = "abx805-rtc",
+ },
+ .probe = &abx805_probe,
+ .remove = &abx805_remove,
+ .id_table = abx805_id,
+};
+
+module_i2c_driver(abx805_driver);
+
+MODULE_AUTHOR("Philippe De Muyter <phdm@macqel.be>");
+MODULE_DESCRIPTION("Abracon AB X805 RTC driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rtc/rtc-abx80x.c b/drivers/rtc/rtc-abx80x.c
new file mode 100644
index 000000000000..588257cec152
--- /dev/null
+++ b/drivers/rtc/rtc-abx80x.c
@@ -0,0 +1,193 @@
+/*
+ * An I2C driver for the Abracon AB08xx
+ *
+ * Author: Alexandre Belloni <alexandre.belloni@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/i2c.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/log2.h>
+
+#define ABX8XX_REG_HTH 0x00
+#define ABX8XX_REG_SC 0x01
+#define ABX8XX_REG_MN 0x02
+#define ABX8XX_REG_HR 0x03
+#define ABX8XX_REG_DA 0x04
+#define ABX8XX_REG_MO 0x05
+#define ABX8XX_REG_YR 0x06
+#define ABX8XX_REG_WD 0x07
+
+#define ABX8XX_REG_CTRL1 0x10
+
+#define ABX8XX_REG_PART0 0x28
+#define ABX8XX_REG_PART1 0x29
+
+#define ABX8XX_CTRL_WRITE BIT(1)
+#define ABX8XX_CTRL_12_24 BIT(6)
+
+enum abx80x_chip {ABX80X, AB0801, AB0802, AB0803, AB0804, AB0805,
+ AB1801, AB1802, AB1803, AB1804, AB1805};
+
+static int abx80x_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ unsigned char date[8];
+ int err;
+
+ err = i2c_smbus_read_i2c_block_data(client, ABX8XX_REG_HTH,
+ 8, date);
+ if (err < 0) {
+ dev_err(&client->dev, "Unable to read date\n");
+ return -EIO;
+ }
+
+ tm->tm_sec = bcd2bin(date[ABX8XX_REG_SC] & 0x7F);
+ tm->tm_min = bcd2bin(date[ABX8XX_REG_MN] & 0x7F);
+ tm->tm_hour = bcd2bin(date[ABX8XX_REG_HR] & 0x3F);
+ tm->tm_wday = date[ABX8XX_REG_WD] & 0x7;
+ tm->tm_mday = bcd2bin(date[ABX8XX_REG_DA] & 0x3F);
+ tm->tm_mon = bcd2bin(date[ABX8XX_REG_MO] & 0x1F) - 1;
+ tm->tm_year = bcd2bin(date[ABX8XX_REG_YR]);
+ if (tm->tm_year < 70)
+ tm->tm_year += 100;
+
+ err = rtc_valid_tm(tm);
+ if (err < 0)
+ dev_err(&client->dev, "retrieved date/time is not valid.\n");
+
+ return err;
+}
+
+static int abx80x_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ int data, err;
+ unsigned char buf[8];
+
+ buf[ABX8XX_REG_SC] = bin2bcd(tm->tm_sec);
+ buf[ABX8XX_REG_MN] = bin2bcd(tm->tm_min);
+ buf[ABX8XX_REG_HR] = bin2bcd(tm->tm_hour);
+ buf[ABX8XX_REG_DA] = bin2bcd(tm->tm_mday);
+ buf[ABX8XX_REG_MO] = bin2bcd(tm->tm_mon + 1);
+ buf[ABX8XX_REG_YR] = bin2bcd(tm->tm_year % 100);
+ buf[ABX8XX_REG_WD] = (0x1 << tm->tm_wday);
+
+ data = i2c_smbus_read_byte_data(client, ABX8XX_REG_CTRL1);
+ if (data < 0) {
+ dev_err(&client->dev, "Unable to read control register\n");
+ return -EIO;
+ }
+
+ err = i2c_smbus_write_byte_data(client, ABX8XX_REG_CTRL1,
+ (data | ABX8XX_CTRL_WRITE));
+ if (err < 0) {
+ dev_err(&client->dev, "Unable to write control register\n");
+ return -EIO;
+ }
+
+ err = i2c_smbus_write_i2c_block_data(client, ABX8XX_REG_SC, 7,
+ &buf[ABX8XX_REG_SC]);
+ if (err < 0) {
+ dev_err(&client->dev, "Unable to write to date registers\n");
+ return -EIO;
+ }
+
+ err = i2c_smbus_write_byte_data(client, ABX8XX_REG_CTRL1,
+ (data & ~ABX8XX_CTRL_WRITE));
+ if (err < 0) {
+ dev_err(&client->dev, "Unable to write control register\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static const struct rtc_class_ops abx80x_rtc_ops = {
+ .read_time = abx80x_rtc_read_time,
+ .set_time = abx80x_rtc_set_time,
+};
+
+static int abx80x_probe(struct i2c_client *client,
+ const struct i2c_device_id *id)
+{
+ struct rtc_device *rtc;
+ int data, err, part0, part1;
+
+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C))
+ return -ENODEV;
+
+ part0 = i2c_smbus_read_byte_data(client, ABX8XX_REG_PART0);
+ part1 = i2c_smbus_read_byte_data(client, ABX8XX_REG_PART1);
+ if ((part0 < 0) || (part1 < 0)) {
+ dev_err(&client->dev, "Unable to read part number\n");
+ return -EIO;
+ }
+ dev_info(&client->dev, "chip found %02x%02x\n", part0, part1);
+
+ data = i2c_smbus_read_byte_data(client, ABX8XX_REG_CTRL1);
+ if (data < 0) {
+ dev_err(&client->dev, "Unable to read control register\n");
+ return -EIO;
+ }
+
+ err = i2c_smbus_write_byte_data(client, ABX8XX_REG_CTRL1,
+ (data & ~ABX8XX_CTRL_12_24));
+ if (err < 0) {
+ dev_err(&client->dev, "Unable to write control register\n");
+ return -EIO;
+ }
+
+ rtc = devm_rtc_device_register(&client->dev, "rtc-abx80x",
+ &abx80x_rtc_ops, THIS_MODULE);
+
+ if (IS_ERR(rtc))
+ return PTR_ERR(rtc);
+
+ i2c_set_clientdata(client, rtc);
+
+ return 0;
+}
+
+static int abx80x_remove(struct i2c_client *client)
+{
+ return 0;
+}
+
+static const struct i2c_device_id abx80x_id[] = {
+ { "abx80x", ABX80X },
+ { "ab0801", AB0801 },
+ { "ab0802", AB0802 },
+ { "ab0803", AB0803 },
+ { "ab0804", AB0804 },
+ { "ab0805", AB0805 },
+ { "ab1801", AB1801 },
+ { "ab1802", AB1802 },
+ { "ab1803", AB1803 },
+ { "ab1804", AB1804 },
+ { "ab1805", AB1805 },
+ { }
+};
+MODULE_DEVICE_TABLE(i2c, abx80x_id);
+
+static struct i2c_driver abx80x_driver = {
+ .driver = {
+ .name = "rtc-abx80x",
+ .owner = THIS_MODULE,
+ },
+ .probe = abx80x_probe,
+ .remove = abx80x_remove,
+ .id_table = abx80x_id,
+};
+
+module_i2c_driver(abx80x_driver);
+
+MODULE_AUTHOR("Alexandre Belloni <alexandre.belloni@free-electrons.com>");
+MODULE_DESCRIPTION("Abracon ABX80X RTC driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 5b2e76159b41..87647f459198 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -459,23 +459,25 @@ static int cmos_procfs(struct device *dev, struct seq_file *seq)
/* NOTE: at least ICH6 reports battery status using a different
* (non-RTC) bit; and SQWE is ignored on many current systems.
*/
- return seq_printf(seq,
- "periodic_IRQ\t: %s\n"
- "update_IRQ\t: %s\n"
- "HPET_emulated\t: %s\n"
- // "square_wave\t: %s\n"
- "BCD\t\t: %s\n"
- "DST_enable\t: %s\n"
- "periodic_freq\t: %d\n"
- "batt_status\t: %s\n",
- (rtc_control & RTC_PIE) ? "yes" : "no",
- (rtc_control & RTC_UIE) ? "yes" : "no",
- is_hpet_enabled() ? "yes" : "no",
- // (rtc_control & RTC_SQWE) ? "yes" : "no",
- (rtc_control & RTC_DM_BINARY) ? "no" : "yes",
- (rtc_control & RTC_DST_EN) ? "yes" : "no",
- cmos->rtc->irq_freq,
- (valid & RTC_VRT) ? "okay" : "dead");
+ seq_printf(seq,
+ "periodic_IRQ\t: %s\n"
+ "update_IRQ\t: %s\n"
+ "HPET_emulated\t: %s\n"
+ // "square_wave\t: %s\n"
+ "BCD\t\t: %s\n"
+ "DST_enable\t: %s\n"
+ "periodic_freq\t: %d\n"
+ "batt_status\t: %s\n",
+ (rtc_control & RTC_PIE) ? "yes" : "no",
+ (rtc_control & RTC_UIE) ? "yes" : "no",
+ is_hpet_enabled() ? "yes" : "no",
+ // (rtc_control & RTC_SQWE) ? "yes" : "no",
+ (rtc_control & RTC_DM_BINARY) ? "no" : "yes",
+ (rtc_control & RTC_DST_EN) ? "yes" : "no",
+ cmos->rtc->irq_freq,
+ (valid & RTC_VRT) ? "okay" : "dead");
+
+ return 0;
}
#else
diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c
index 613c43b7e9ae..1ba4371cbc2d 100644
--- a/drivers/rtc/rtc-da9052.c
+++ b/drivers/rtc/rtc-da9052.c
@@ -16,6 +16,7 @@
#include <linux/platform_device.h>
#include <linux/rtc.h>
#include <linux/err.h>
+#include <linux/delay.h>
#include <linux/mfd/da9052/da9052.h>
#include <linux/mfd/da9052/reg.h>
@@ -23,6 +24,8 @@
#define rtc_err(rtc, fmt, ...) \
dev_err(rtc->da9052->dev, "%s: " fmt, __func__, ##__VA_ARGS__)
+#define DA9052_GET_TIME_RETRIES 5
+
struct da9052_rtc {
struct rtc_device *rtc;
struct da9052 *da9052;
@@ -58,22 +61,43 @@ static irqreturn_t da9052_rtc_irq(int irq, void *data)
static int da9052_read_alarm(struct da9052_rtc *rtc, struct rtc_time *rtc_tm)
{
int ret;
- uint8_t v[5];
+ uint8_t v[2][5];
+ int idx = 1;
+ int timeout = DA9052_GET_TIME_RETRIES;
- ret = da9052_group_read(rtc->da9052, DA9052_ALARM_MI_REG, 5, v);
- if (ret != 0) {
+ ret = da9052_group_read(rtc->da9052, DA9052_ALARM_MI_REG, 5, &v[0][0]);
+ if (ret) {
rtc_err(rtc, "Failed to group read ALM: %d\n", ret);
return ret;
}
- rtc_tm->tm_year = (v[4] & DA9052_RTC_YEAR) + 100;
- rtc_tm->tm_mon = (v[3] & DA9052_RTC_MONTH) - 1;
- rtc_tm->tm_mday = v[2] & DA9052_RTC_DAY;
- rtc_tm->tm_hour = v[1] & DA9052_RTC_HOUR;
- rtc_tm->tm_min = v[0] & DA9052_RTC_MIN;
+ do {
+ ret = da9052_group_read(rtc->da9052,
+ DA9052_ALARM_MI_REG, 5, &v[idx][0]);
+ if (ret) {
+ rtc_err(rtc, "Failed to group read ALM: %d\n", ret);
+ return ret;
+ }
- ret = rtc_valid_tm(rtc_tm);
- return ret;
+ if (memcmp(&v[0][0], &v[1][0], 5) == 0) {
+ rtc_tm->tm_year = (v[0][4] & DA9052_RTC_YEAR) + 100;
+ rtc_tm->tm_mon = (v[0][3] & DA9052_RTC_MONTH) - 1;
+ rtc_tm->tm_mday = v[0][2] & DA9052_RTC_DAY;
+ rtc_tm->tm_hour = v[0][1] & DA9052_RTC_HOUR;
+ rtc_tm->tm_min = v[0][0] & DA9052_RTC_MIN;
+
+ ret = rtc_valid_tm(rtc_tm);
+ return ret;
+ }
+
+ idx = (1-idx);
+ msleep(20);
+
+ } while (timeout--);
+
+ rtc_err(rtc, "Timed out reading alarm time\n");
+
+ return -EIO;
}
static int da9052_set_alarm(struct da9052_rtc *rtc, struct rtc_time *rtc_tm)
@@ -135,24 +159,45 @@ static int da9052_rtc_get_alarm_status(struct da9052_rtc *rtc)
static int da9052_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm)
{
struct da9052_rtc *rtc = dev_get_drvdata(dev);
- uint8_t v[6];
int ret;
+ uint8_t v[2][6];
+ int idx = 1;
+ int timeout = DA9052_GET_TIME_RETRIES;
- ret = da9052_group_read(rtc->da9052, DA9052_COUNT_S_REG, 6, v);
- if (ret < 0) {
+ ret = da9052_group_read(rtc->da9052, DA9052_COUNT_S_REG, 6, &v[0][0]);
+ if (ret) {
rtc_err(rtc, "Failed to read RTC time : %d\n", ret);
return ret;
}
- rtc_tm->tm_year = (v[5] & DA9052_RTC_YEAR) + 100;
- rtc_tm->tm_mon = (v[4] & DA9052_RTC_MONTH) - 1;
- rtc_tm->tm_mday = v[3] & DA9052_RTC_DAY;
- rtc_tm->tm_hour = v[2] & DA9052_RTC_HOUR;
- rtc_tm->tm_min = v[1] & DA9052_RTC_MIN;
- rtc_tm->tm_sec = v[0] & DA9052_RTC_SEC;
+ do {
+ ret = da9052_group_read(rtc->da9052,
+ DA9052_COUNT_S_REG, 6, &v[idx][0]);
+ if (ret) {
+ rtc_err(rtc, "Failed to read RTC time : %d\n", ret);
+ return ret;
+ }
- ret = rtc_valid_tm(rtc_tm);
- return ret;
+ if (memcmp(&v[0][0], &v[1][0], 6) == 0) {
+ rtc_tm->tm_year = (v[0][5] & DA9052_RTC_YEAR) + 100;
+ rtc_tm->tm_mon = (v[0][4] & DA9052_RTC_MONTH) - 1;
+ rtc_tm->tm_mday = v[0][3] & DA9052_RTC_DAY;
+ rtc_tm->tm_hour = v[0][2] & DA9052_RTC_HOUR;
+ rtc_tm->tm_min = v[0][1] & DA9052_RTC_MIN;
+ rtc_tm->tm_sec = v[0][0] & DA9052_RTC_SEC;
+
+ ret = rtc_valid_tm(rtc_tm);
+ return ret;
+ }
+
+ idx = (1-idx);
+ msleep(20);
+
+ } while (timeout--);
+
+ rtc_err(rtc, "Timed out reading time\n");
+
+ return -EIO;
}
static int da9052_rtc_set_time(struct device *dev, struct rtc_time *tm)
@@ -161,6 +206,10 @@ static int da9052_rtc_set_time(struct device *dev, struct rtc_time *tm)
uint8_t v[6];
int ret;
+ /* DA9052 only has 6 bits for year - to represent 2000-2063 */
+ if ((tm->tm_year < 100) || (tm->tm_year > 163))
+ return -EINVAL;
+
rtc = dev_get_drvdata(dev);
v[0] = tm->tm_sec;
@@ -198,6 +247,10 @@ static int da9052_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
struct rtc_time *tm = &alrm->time;
struct da9052_rtc *rtc = dev_get_drvdata(dev);
+ /* DA9052 only has 6 bits for year - to represent 2000-2063 */
+ if ((tm->tm_year < 100) || (tm->tm_year > 163))
+ return -EINVAL;
+
ret = da9052_rtc_enable_alarm(rtc, 0);
if (ret < 0)
return ret;
@@ -256,6 +309,8 @@ static int da9052_rtc_probe(struct platform_device *pdev)
return ret;
}
+ device_init_wakeup(&pdev->dev, true);
+
rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
&da9052_rtc_ops, THIS_MODULE);
return PTR_ERR_OR_ZERO(rtc->rtc);
diff --git a/drivers/rtc/rtc-digicolor.c b/drivers/rtc/rtc-digicolor.c
new file mode 100644
index 000000000000..8d05596a6765
--- /dev/null
+++ b/drivers/rtc/rtc-digicolor.c
@@ -0,0 +1,227 @@
+/*
+ * Real Time Clock driver for Conexant Digicolor
+ *
+ * Copyright (C) 2015 Paradox Innovation Ltd.
+ *
+ * Author: Baruch Siach <baruch@tkos.co.il>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/of.h>
+
+#define DC_RTC_CONTROL 0x0
+#define DC_RTC_TIME 0x8
+#define DC_RTC_REFERENCE 0xc
+#define DC_RTC_ALARM 0x10
+#define DC_RTC_INTFLAG_CLEAR 0x14
+#define DC_RTC_INTENABLE 0x16
+
+#define DC_RTC_CMD_MASK 0xf
+#define DC_RTC_GO_BUSY BIT(7)
+
+#define CMD_NOP 0
+#define CMD_RESET 1
+#define CMD_WRITE 3
+#define CMD_READ 4
+
+#define CMD_DELAY_US (10*1000)
+#define CMD_TIMEOUT_US (500*CMD_DELAY_US)
+
+struct dc_rtc {
+ struct rtc_device *rtc_dev;
+ void __iomem *regs;
+};
+
+static int dc_rtc_cmds(struct dc_rtc *rtc, const u8 *cmds, int len)
+{
+ u8 val;
+ int i, ret;
+
+ for (i = 0; i < len; i++) {
+ writeb_relaxed((cmds[i] & DC_RTC_CMD_MASK) | DC_RTC_GO_BUSY,
+ rtc->regs + DC_RTC_CONTROL);
+ ret = readb_relaxed_poll_timeout(
+ rtc->regs + DC_RTC_CONTROL, val,
+ !(val & DC_RTC_GO_BUSY), CMD_DELAY_US, CMD_TIMEOUT_US);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int dc_rtc_read(struct dc_rtc *rtc, unsigned long *val)
+{
+ static const u8 read_cmds[] = {CMD_READ, CMD_NOP};
+ u32 reference, time1, time2;
+ int ret;
+
+ ret = dc_rtc_cmds(rtc, read_cmds, ARRAY_SIZE(read_cmds));
+ if (ret < 0)
+ return ret;
+
+ reference = readl_relaxed(rtc->regs + DC_RTC_REFERENCE);
+ time1 = readl_relaxed(rtc->regs + DC_RTC_TIME);
+ /* Read twice to ensure consistency */
+ while (1) {
+ time2 = readl_relaxed(rtc->regs + DC_RTC_TIME);
+ if (time1 == time2)
+ break;
+ time1 = time2;
+ }
+
+ *val = reference + time1;
+ return 0;
+}
+
+static int dc_rtc_write(struct dc_rtc *rtc, u32 val)
+{
+ static const u8 write_cmds[] = {CMD_WRITE, CMD_NOP, CMD_RESET, CMD_NOP};
+
+ writel_relaxed(val, rtc->regs + DC_RTC_REFERENCE);
+ return dc_rtc_cmds(rtc, write_cmds, ARRAY_SIZE(write_cmds));
+}
+
+static int dc_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+ struct dc_rtc *rtc = dev_get_drvdata(dev);
+ unsigned long now;
+ int ret;
+
+ ret = dc_rtc_read(rtc, &now);
+ if (ret < 0)
+ return ret;
+ rtc_time64_to_tm(now, tm);
+
+ return 0;
+}
+
+static int dc_rtc_set_mmss(struct device *dev, unsigned long secs)
+{
+ struct dc_rtc *rtc = dev_get_drvdata(dev);
+
+ return dc_rtc_write(rtc, secs);
+}
+
+static int dc_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
+{
+ struct dc_rtc *rtc = dev_get_drvdata(dev);
+ u32 alarm_reg, reference;
+ unsigned long now;
+ int ret;
+
+ alarm_reg = readl_relaxed(rtc->regs + DC_RTC_ALARM);
+ reference = readl_relaxed(rtc->regs + DC_RTC_REFERENCE);
+ rtc_time64_to_tm(reference + alarm_reg, &alarm->time);
+
+ ret = dc_rtc_read(rtc, &now);
+ if (ret < 0)
+ return ret;
+
+ alarm->pending = alarm_reg + reference > now;
+ alarm->enabled = readl_relaxed(rtc->regs + DC_RTC_INTENABLE);
+
+ return 0;
+}
+
+static int dc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
+{
+ struct dc_rtc *rtc = dev_get_drvdata(dev);
+ time64_t alarm_time;
+ u32 reference;
+
+ alarm_time = rtc_tm_to_time64(&alarm->time);
+
+ reference = readl_relaxed(rtc->regs + DC_RTC_REFERENCE);
+ writel_relaxed(alarm_time - reference, rtc->regs + DC_RTC_ALARM);
+
+ writeb_relaxed(!!alarm->enabled, rtc->regs + DC_RTC_INTENABLE);
+
+ return 0;
+}
+
+static int dc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+ struct dc_rtc *rtc = dev_get_drvdata(dev);
+
+ writeb_relaxed(!!enabled, rtc->regs + DC_RTC_INTENABLE);
+
+ return 0;
+}
+
+static struct rtc_class_ops dc_rtc_ops = {
+ .read_time = dc_rtc_read_time,
+ .set_mmss = dc_rtc_set_mmss,
+ .read_alarm = dc_rtc_read_alarm,
+ .set_alarm = dc_rtc_set_alarm,
+ .alarm_irq_enable = dc_rtc_alarm_irq_enable,
+};
+
+static irqreturn_t dc_rtc_irq(int irq, void *dev_id)
+{
+ struct dc_rtc *rtc = dev_id;
+
+ writeb_relaxed(1, rtc->regs + DC_RTC_INTFLAG_CLEAR);
+ rtc_update_irq(rtc->rtc_dev, 1, RTC_AF | RTC_IRQF);
+
+ return IRQ_HANDLED;
+}
+
+static int __init dc_rtc_probe(struct platform_device *pdev)
+{
+ struct resource *res;
+ struct dc_rtc *rtc;
+ int irq, ret;
+
+ rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+ if (!rtc)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ rtc->regs = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(rtc->regs))
+ return PTR_ERR(rtc->regs);
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+ ret = devm_request_irq(&pdev->dev, irq, dc_rtc_irq, 0, pdev->name, rtc);
+ if (ret < 0)
+ return ret;
+
+ platform_set_drvdata(pdev, rtc);
+ rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, pdev->name,
+ &dc_rtc_ops, THIS_MODULE);
+ if (IS_ERR(rtc->rtc_dev))
+ return PTR_ERR(rtc->rtc_dev);
+
+ return 0;
+}
+
+static const struct of_device_id dc_dt_ids[] = {
+ { .compatible = "cnxt,cx92755-rtc" },
+ { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, dc_dt_ids);
+
+static struct platform_driver dc_rtc_driver = {
+ .driver = {
+ .name = "digicolor_rtc",
+ .of_match_table = of_match_ptr(dc_dt_ids),
+ },
+};
+module_platform_driver_probe(dc_rtc_driver, dc_rtc_probe);
+
+MODULE_AUTHOR("Baruch Siach <baruch@tkos.co.il>");
+MODULE_DESCRIPTION("Conexant Digicolor Realtime Clock Driver (RTC)");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c
index 129add77065d..12b07158a366 100644
--- a/drivers/rtc/rtc-ds1305.c
+++ b/drivers/rtc/rtc-ds1305.c
@@ -434,9 +434,9 @@ static int ds1305_proc(struct device *dev, struct seq_file *seq)
}
done:
- return seq_printf(seq,
- "trickle_charge\t: %s%s\n",
- diodes, resistors);
+ seq_printf(seq, "trickle_charge\t: %s%s\n", diodes, resistors);
+
+ return 0;
}
#else
diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c
index 803869c7d7c2..70202098a8ce 100644
--- a/drivers/rtc/rtc-ds1685.c
+++ b/drivers/rtc/rtc-ds1685.c
@@ -799,7 +799,7 @@ ds1685_rtc_proc(struct device *dev, struct seq_file *seq)
struct platform_device *pdev = to_platform_device(dev);
struct ds1685_priv *rtc = platform_get_drvdata(pdev);
u8 ctrla, ctrlb, ctrlc, ctrld, ctrl4a, ctrl4b, ssn[8];
- char *model = '\0';
+ char *model;
#ifdef CONFIG_RTC_DS1685_PROC_REGS
char bits[NUM_REGS][(NUM_BITS * NUM_SPACES) + NUM_BITS + 1];
#endif
@@ -2139,7 +2139,6 @@ ds1685_rtc_remove(struct platform_device *pdev)
static struct platform_driver ds1685_rtc_driver = {
.driver = {
.name = "rtc-ds1685",
- .owner = THIS_MODULE,
},
.probe = ds1685_rtc_probe,
.remove = ds1685_rtc_remove,
@@ -2175,7 +2174,7 @@ module_exit(ds1685_rtc_exit);
* ds1685_rtc_poweroff - uses the RTC chip to power the system off.
* @pdev: pointer to platform_device structure.
*/
-extern void __noreturn
+void __noreturn
ds1685_rtc_poweroff(struct platform_device *pdev)
{
u8 ctrla, ctrl4a, ctrl4b;
diff --git a/drivers/rtc/rtc-em3027.c b/drivers/rtc/rtc-em3027.c
index fccf36699245..4f4930a2004c 100644
--- a/drivers/rtc/rtc-em3027.c
+++ b/drivers/rtc/rtc-em3027.c
@@ -15,6 +15,7 @@
#include <linux/rtc.h>
#include <linux/bcd.h>
#include <linux/module.h>
+#include <linux/of.h>
/* Registers */
#define EM3027_REG_ON_OFF_CTRL 0x00
@@ -135,10 +136,20 @@ static struct i2c_device_id em3027_id[] = {
{ "em3027", 0 },
{ }
};
+MODULE_DEVICE_TABLE(i2c, em3027_id);
+
+#ifdef CONFIG_OF
+static const struct of_device_id em3027_of_match[] = {
+ { .compatible = "emmicro,em3027", },
+ {}
+};
+MODULE_DEVICE_TABLE(of, em3027_of_match);
+#endif
static struct i2c_driver em3027_driver = {
.driver = {
.name = "rtc-em3027",
+ .of_match_table = of_match_ptr(em3027_of_match),
},
.probe = &em3027_probe,
.id_table = em3027_id,
diff --git a/drivers/rtc/rtc-hym8563.c b/drivers/rtc/rtc-hym8563.c
index b936bb4096b5..d4a65619ef14 100644
--- a/drivers/rtc/rtc-hym8563.c
+++ b/drivers/rtc/rtc-hym8563.c
@@ -309,7 +309,7 @@ static unsigned long hym8563_clkout_recalc_rate(struct clk_hw *hw,
struct i2c_client *client = hym8563->client;
int ret = i2c_smbus_read_byte_data(client, HYM8563_CLKOUT);
- if (ret < 0 || ret & HYM8563_CLKOUT_DISABLE)
+ if (ret < 0)
return 0;
ret &= HYM8563_CLKOUT_MASK;
diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c
index 5bce904b7ee6..c8e78a560de7 100644
--- a/drivers/rtc/rtc-mc13xxx.c
+++ b/drivers/rtc/rtc-mc13xxx.c
@@ -219,7 +219,7 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
if (unlikely(ret))
goto out;
- dev_dbg(dev, "%s: o%2.s %lu\n", __func__, alarm->enabled ? "n" : "ff",
+ dev_dbg(dev, "%s: %s %lu\n", __func__, alarm->enabled ? "on" : "off",
s1970);
ret = mc13xxx_rtc_irq_enable_unlocked(dev, alarm->enabled,
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
index 3a6fd3a8a2ec..548ea6f6f384 100644
--- a/drivers/rtc/rtc-mrst.c
+++ b/drivers/rtc/rtc-mrst.c
@@ -277,13 +277,15 @@ static int mrst_procfs(struct device *dev, struct seq_file *seq)
valid = vrtc_cmos_read(RTC_VALID);
spin_unlock_irq(&rtc_lock);
- return seq_printf(seq,
- "periodic_IRQ\t: %s\n"
- "alarm\t\t: %s\n"
- "BCD\t\t: no\n"
- "periodic_freq\t: daily (not adjustable)\n",
- (rtc_control & RTC_PIE) ? "on" : "off",
- (rtc_control & RTC_AIE) ? "on" : "off");
+ seq_printf(seq,
+ "periodic_IRQ\t: %s\n"
+ "alarm\t\t: %s\n"
+ "BCD\t\t: no\n"
+ "periodic_freq\t: daily (not adjustable)\n",
+ (rtc_control & RTC_PIE) ? "on" : "off",
+ (rtc_control & RTC_AIE) ? "on" : "off");
+
+ return 0;
}
#else
diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c
new file mode 100644
index 000000000000..b54d60530d94
--- /dev/null
+++ b/drivers/rtc/rtc-mt6397.c
@@ -0,0 +1,352 @@
+/*
+* Copyright (c) 2014-2015 MediaTek Inc.
+* Author: Tianping.Fang <tianping.fang@mediatek.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License version 2 as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*/
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/regmap.h>
+#include <linux/rtc.h>
+#include <linux/irqdomain.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/of_address.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <linux/mfd/mt6397/core.h>
+
+#define RTC_BBPU 0x0000
+#define RTC_WRTGR 0x003c
+#define RTC_IRQ_EN 0x0004
+#define RTC_IRQ_STA 0x0002
+
+#define RTC_BBPU_CBUSY (1 << 6)
+#define RTC_BBPU_KEY (0x43 << 8)
+#define RTC_BBPU_AUTO (1 << 3)
+#define RTC_IRQ_STA_AL (1 << 0)
+#define RTC_IRQ_STA_LP (1 << 3)
+
+#define RTC_TC_SEC 0x000a
+#define RTC_TC_MIN 0x000c
+#define RTC_TC_HOU 0x000e
+#define RTC_TC_DOM 0x0010
+#define RTC_TC_MTH 0x0014
+#define RTC_TC_YEA 0x0016
+#define RTC_AL_SEC 0x0018
+#define RTC_AL_MIN 0x001a
+
+#define RTC_IRQ_EN_AL (1 << 0)
+#define RTC_IRQ_EN_ONESHOT (1 << 2)
+#define RTC_IRQ_EN_LP (1 << 3)
+#define RTC_IRQ_EN_ONESHOT_AL (RTC_IRQ_EN_ONESHOT | RTC_IRQ_EN_AL)
+
+#define RTC_TC_MIN_MASK 0x003f
+#define RTC_TC_SEC_MASK 0x003f
+#define RTC_TC_HOU_MASK 0x001f
+#define RTC_TC_DOM_MASK 0x001f
+#define RTC_TC_MTH_MASK 0x000f
+#define RTC_TC_YEA_MASK 0x007f
+
+#define RTC_AL_SEC_MASK 0x003f
+#define RTC_AL_MIN_MASK 0x003f
+#define RTC_AL_MASK_DOW (1 << 4)
+
+#define RTC_AL_HOU 0x001c
+#define RTC_NEW_SPARE_FG_MASK 0xff00
+#define RTC_NEW_SPARE_FG_SHIFT 8
+#define RTC_AL_HOU_MASK 0x001f
+
+#define RTC_AL_DOM 0x001e
+#define RTC_NEW_SPARE1 0xff00
+#define RTC_AL_DOM_MASK 0x001f
+#define RTC_AL_MASK 0x0008
+
+#define RTC_AL_MTH 0x0022
+#define RTC_NEW_SPARE3 0xff00
+#define RTC_AL_MTH_MASK 0x000f
+
+#define RTC_AL_YEA 0x0024
+#define RTC_AL_YEA_MASK 0x007f
+
+#define RTC_PDN1 0x002c
+#define RTC_PDN1_PWRON_TIME (1 << 7)
+
+#define RTC_PDN2 0x002e
+#define RTC_PDN2_PWRON_MTH_MASK 0x000f
+#define RTC_PDN2_PWRON_MTH_SHIFT 0
+#define RTC_PDN2_PWRON_ALARM (1 << 4)
+#define RTC_PDN2_UART_MASK 0x0060
+#define RTC_PDN2_UART_SHIFT 5
+#define RTC_PDN2_PWRON_YEA_MASK 0x7f00
+#define RTC_PDN2_PWRON_YEA_SHIFT 8
+#define RTC_PDN2_PWRON_LOGO (1 << 15)
+
+#define RTC_MIN_YEAR 1968
+#define RTC_BASE_YEAR 1900
+#define RTC_NUM_YEARS 128
+#define RTC_MIN_YEAR_OFFSET (RTC_MIN_YEAR - RTC_BASE_YEAR)
+#define RTC_RELPWR_WHEN_XRST 1
+
+struct mt6397_rtc {
+ struct device *dev;
+ struct rtc_device *rtc_dev;
+ struct mutex lock;
+ struct regmap *regmap;
+ int irq;
+ u32 addr_base;
+ u32 addr_range;
+};
+
+static u16 rtc_read(struct mt6397_rtc *rtc, u32 offset)
+{
+ u32 rdata = 0;
+ u32 addr = rtc->addr_base + offset;
+
+ if (offset < rtc->addr_range)
+ regmap_read(rtc->regmap, addr, &rdata);
+
+ return (u16)rdata;
+}
+
+static void rtc_write(struct mt6397_rtc *rtc, u32 offset, u32 data)
+{
+ u32 addr;
+
+ addr = rtc->addr_base + offset;
+
+ if (offset < rtc->addr_range)
+ regmap_write(rtc->regmap, addr, data);
+}
+
+static void rtc_write_trigger(struct mt6397_rtc *rtc)
+{
+ rtc_write(rtc, RTC_WRTGR, 1);
+ while (rtc_read(rtc, RTC_BBPU) & RTC_BBPU_CBUSY)
+ cpu_relax();
+}
+
+static irqreturn_t rtc_irq_handler_thread(int irq, void *data)
+{
+ struct mt6397_rtc *rtc = data;
+ u16 irqsta, irqen;
+
+ mutex_lock(&rtc->lock);
+ irqsta = rtc_read(rtc, RTC_IRQ_STA);
+ mutex_unlock(&rtc->lock);
+
+ if (irqsta & RTC_IRQ_STA_AL) {
+ rtc_update_irq(rtc->rtc_dev, 1, RTC_IRQF | RTC_AF);
+ irqen = irqsta & ~RTC_IRQ_EN_AL;
+ rtc_write(rtc, RTC_IRQ_EN, irqen);
+ rtc_write_trigger(rtc);
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_NONE;
+}
+
+static int mtk_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+ unsigned long time;
+ struct mt6397_rtc *rtc = dev_get_drvdata(dev);
+
+ mutex_lock(&rtc->lock);
+ do {
+ tm->tm_sec = rtc_read(rtc, RTC_TC_SEC);
+ tm->tm_min = rtc_read(rtc, RTC_TC_MIN);
+ tm->tm_hour = rtc_read(rtc, RTC_TC_HOU);
+ tm->tm_mday = rtc_read(rtc, RTC_TC_DOM);
+ tm->tm_mon = rtc_read(rtc, RTC_TC_MTH);
+ tm->tm_year = rtc_read(rtc, RTC_TC_YEA);
+ } while (rtc_read(rtc, RTC_TC_SEC) < tm->tm_sec);
+ mutex_unlock(&rtc->lock);
+
+ tm->tm_year += RTC_MIN_YEAR_OFFSET;
+ tm->tm_mon--;
+ rtc_tm_to_time(tm, &time);
+
+ tm->tm_wday = (time / 86400 + 4) % 7;
+
+ return 0;
+}
+
+static int mtk_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+ struct mt6397_rtc *rtc = dev_get_drvdata(dev);
+
+ tm->tm_year -= RTC_MIN_YEAR_OFFSET;
+ tm->tm_mon++;
+ mutex_lock(&rtc->lock);
+ rtc_write(rtc, RTC_TC_YEA, tm->tm_year);
+ rtc_write(rtc, RTC_TC_MTH, tm->tm_mon);
+ rtc_write(rtc, RTC_TC_DOM, tm->tm_mday);
+ rtc_write(rtc, RTC_TC_HOU, tm->tm_hour);
+ rtc_write(rtc, RTC_TC_MIN, tm->tm_min);
+ rtc_write(rtc, RTC_TC_SEC, tm->tm_sec);
+ rtc_write_trigger(rtc);
+ mutex_unlock(&rtc->lock);
+
+ return 0;
+}
+
+static int mtk_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+ struct rtc_time *tm = &alm->time;
+ struct mt6397_rtc *rtc = dev_get_drvdata(dev);
+ u16 irqen, pdn2;
+
+ mutex_lock(&rtc->lock);
+ irqen = rtc_read(rtc, RTC_IRQ_EN);
+ pdn2 = rtc_read(rtc, RTC_PDN2);
+ tm->tm_sec = rtc_read(rtc, RTC_AL_SEC);
+ tm->tm_min = rtc_read(rtc, RTC_AL_MIN);
+ tm->tm_hour = rtc_read(rtc, RTC_AL_HOU) & RTC_AL_HOU_MASK;
+ tm->tm_mday = rtc_read(rtc, RTC_AL_DOM) & RTC_AL_DOM_MASK;
+ tm->tm_mon = rtc_read(rtc, RTC_AL_MTH) & RTC_AL_MTH_MASK;
+ tm->tm_year = rtc_read(rtc, RTC_AL_YEA);
+ mutex_unlock(&rtc->lock);
+
+ alm->enabled = !!(irqen & RTC_IRQ_EN_AL);
+ alm->pending = !!(pdn2 & RTC_PDN2_PWRON_ALARM);
+
+ tm->tm_year += RTC_MIN_YEAR_OFFSET;
+ tm->tm_mon--;
+
+ return 0;
+}
+
+static int mtk_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+ struct rtc_time *tm = &alm->time;
+ struct mt6397_rtc *rtc = dev_get_drvdata(dev);
+ u16 irqen;
+
+ tm->tm_year -= RTC_MIN_YEAR_OFFSET;
+ tm->tm_mon++;
+
+ if (alm->enabled) {
+ mutex_lock(&rtc->lock);
+ rtc_write(rtc, RTC_AL_YEA, tm->tm_year);
+ rtc_write(rtc, RTC_AL_MTH, (rtc_read(rtc, RTC_AL_MTH) &
+ RTC_NEW_SPARE3) | tm->tm_mon);
+ rtc_write(rtc, RTC_AL_DOM, (rtc_read(rtc, RTC_AL_DOM) &
+ RTC_NEW_SPARE1) | tm->tm_mday);
+ rtc_write(rtc, RTC_AL_HOU, (rtc_read(rtc, RTC_AL_HOU) &
+ RTC_NEW_SPARE_FG_MASK) | tm->tm_hour);
+ rtc_write(rtc, RTC_AL_MIN, tm->tm_min);
+ rtc_write(rtc, RTC_AL_SEC, tm->tm_sec);
+ rtc_write(rtc, RTC_AL_MASK, RTC_AL_MASK_DOW);
+ rtc_write_trigger(rtc);
+ irqen = rtc_read(rtc, RTC_IRQ_EN) | RTC_IRQ_EN_ONESHOT_AL;
+ rtc_write(rtc, RTC_IRQ_EN, irqen);
+ rtc_write_trigger(rtc);
+ mutex_unlock(&rtc->lock);
+ }
+
+ return 0;
+}
+
+static struct rtc_class_ops mtk_rtc_ops = {
+ .read_time = mtk_rtc_read_time,
+ .set_time = mtk_rtc_set_time,
+ .read_alarm = mtk_rtc_read_alarm,
+ .set_alarm = mtk_rtc_set_alarm,
+};
+
+static int mtk_rtc_probe(struct platform_device *pdev)
+{
+ struct mt6397_chip *mt6397_chip = dev_get_drvdata(pdev->dev.parent);
+ struct mt6397_rtc *rtc;
+ u32 reg[2];
+ int ret = 0;
+
+ rtc = devm_kzalloc(&pdev->dev, sizeof(struct mt6397_rtc), GFP_KERNEL);
+ if (!rtc)
+ return -ENOMEM;
+
+ ret = of_property_read_u32_array(pdev->dev.of_node, "reg", reg, 2);
+ if (ret) {
+ dev_err(&pdev->dev, "couldn't read rtc base address!\n");
+ return -EINVAL;
+ }
+ rtc->addr_base = reg[0];
+ rtc->addr_range = reg[1];
+ rtc->regmap = mt6397_chip->regmap;
+ rtc->dev = &pdev->dev;
+ mutex_init(&rtc->lock);
+
+ platform_set_drvdata(pdev, rtc);
+
+ rtc->rtc_dev = rtc_device_register("mt6397-rtc", &pdev->dev,
+ &mtk_rtc_ops, THIS_MODULE);
+ if (IS_ERR(rtc->rtc_dev)) {
+ dev_err(&pdev->dev, "register rtc device failed\n");
+ return PTR_ERR(rtc->rtc_dev);
+ }
+
+ rtc->irq = platform_get_irq(pdev, 0);
+ if (rtc->irq < 0) {
+ ret = rtc->irq;
+ goto out_rtc;
+ }
+
+ ret = devm_request_threaded_irq(&pdev->dev, rtc->irq, NULL,
+ rtc_irq_handler_thread, IRQF_ONESHOT,
+ "mt6397-rtc", rtc);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to request alarm IRQ: %d: %d\n",
+ rtc->irq, ret);
+ goto out_rtc;
+ }
+
+ device_init_wakeup(&pdev->dev, 1);
+
+ return 0;
+
+out_rtc:
+ rtc_device_unregister(rtc->rtc_dev);
+ return ret;
+
+}
+
+static int mtk_rtc_remove(struct platform_device *pdev)
+{
+ struct mt6397_rtc *rtc = platform_get_drvdata(pdev);
+
+ rtc_device_unregister(rtc->rtc_dev);
+ return 0;
+}
+
+static const struct of_device_id mt6397_rtc_of_match[] = {
+ { .compatible = "mediatek,mt6397-rtc", },
+ { }
+};
+
+static struct platform_driver mtk_rtc_driver = {
+ .driver = {
+ .name = "mt6397-rtc",
+ .of_match_table = mt6397_rtc_of_match,
+ },
+ .probe = mtk_rtc_probe,
+ .remove = mtk_rtc_remove,
+};
+
+module_platform_driver(mtk_rtc_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianping Fang <tianping.fang@mediatek.com>");
+MODULE_DESCRIPTION("RTC Driver for MediaTek MT6397 PMIC");
+MODULE_ALIAS("platform:mt6397-rtc");
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index 8e5851aa4369..4516ce078f8c 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -107,6 +107,8 @@
/* OMAP_RTC_OSC_REG bit fields: */
#define OMAP_RTC_OSC_32KCLK_EN BIT(6)
+#define OMAP_RTC_OSC_OSC32K_GZ BIT(4)
+#define OMAP_RTC_OSC_EXT_32K BIT(3)
/* OMAP_RTC_IRQWAKEEN bit fields: */
#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1)
@@ -120,6 +122,7 @@
struct omap_rtc_device_type {
bool has_32kclk_en;
+ bool has_osc_ext_32k;
bool has_kicker;
bool has_irqwakeen;
bool has_pmic_mode;
@@ -446,6 +449,7 @@ static const struct omap_rtc_device_type omap_rtc_default_type = {
static const struct omap_rtc_device_type omap_rtc_am3352_type = {
.has_32kclk_en = true,
+ .has_osc_ext_32k = true,
.has_kicker = true,
.has_irqwakeen = true,
.has_pmic_mode = true,
@@ -543,7 +547,15 @@ static int __init omap_rtc_probe(struct platform_device *pdev)
if (rtc->type->has_32kclk_en) {
reg = rtc_read(rtc, OMAP_RTC_OSC_REG);
rtc_writel(rtc, OMAP_RTC_OSC_REG,
- reg | OMAP_RTC_OSC_32KCLK_EN);
+ reg | OMAP_RTC_OSC_32KCLK_EN);
+ }
+
+ /* Enable external clock as the source */
+ if (rtc->type->has_osc_ext_32k) {
+ rtc_writel(rtc, OMAP_RTC_OSC_REG,
+ (OMAP_RTC_OSC_EXT_32K |
+ rtc_read(rtc, OMAP_RTC_OSC_REG)) &
+ (~OMAP_RTC_OSC_OSC32K_GZ));
}
/* clear old status */
diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c
index 96fb32e7d6f8..0ba7e59929be 100644
--- a/drivers/rtc/rtc-pcf8563.c
+++ b/drivers/rtc/rtc-pcf8563.c
@@ -246,7 +246,6 @@ static int pcf8563_get_datetime(struct i2c_client *client, struct rtc_time *tm)
static int pcf8563_set_datetime(struct i2c_client *client, struct rtc_time *tm)
{
struct pcf8563 *pcf8563 = i2c_get_clientdata(client);
- int err;
unsigned char buf[9];
dev_dbg(&client->dev, "%s: secs=%d, mins=%d, hours=%d, "
@@ -272,12 +271,8 @@ static int pcf8563_set_datetime(struct i2c_client *client, struct rtc_time *tm)
buf[PCF8563_REG_DW] = tm->tm_wday & 0x07;
- err = pcf8563_write_block_data(client, PCF8563_REG_SC,
+ return pcf8563_write_block_data(client, PCF8563_REG_SC,
9 - PCF8563_REG_SC, buf + PCF8563_REG_SC);
- if (err)
- return err;
-
- return 0;
}
#ifdef CONFIG_RTC_INTF_DEV
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index f4cf6851fae9..fb0c569765c6 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -39,7 +39,6 @@ struct s3c_rtc {
void __iomem *base;
struct clk *rtc_clk;
struct clk *rtc_src_clk;
- bool enabled;
struct s3c_rtc_data *data;
@@ -67,26 +66,25 @@ struct s3c_rtc_data {
void (*disable) (struct s3c_rtc *info);
};
-static void s3c_rtc_alarm_clk_enable(struct s3c_rtc *info, bool enable)
+static void s3c_rtc_enable_clk(struct s3c_rtc *info)
{
unsigned long irq_flags;
spin_lock_irqsave(&info->alarm_clk_lock, irq_flags);
- if (enable) {
- if (!info->enabled) {
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
- info->enabled = true;
- }
- } else {
- if (info->enabled) {
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
- info->enabled = false;
- }
- }
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+ spin_unlock_irqrestore(&info->alarm_clk_lock, irq_flags);
+}
+
+static void s3c_rtc_disable_clk(struct s3c_rtc *info)
+{
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&info->alarm_clk_lock, irq_flags);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
spin_unlock_irqrestore(&info->alarm_clk_lock, irq_flags);
}
@@ -119,20 +117,16 @@ static int s3c_rtc_setaie(struct device *dev, unsigned int enabled)
dev_dbg(info->dev, "%s: aie=%d\n", __func__, enabled);
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
+ s3c_rtc_enable_clk(info);
+
tmp = readb(info->base + S3C2410_RTCALM) & ~S3C2410_RTCALM_ALMEN;
if (enabled)
tmp |= S3C2410_RTCALM_ALMEN;
writeb(tmp, info->base + S3C2410_RTCALM);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
- s3c_rtc_alarm_clk_enable(info, enabled);
+ s3c_rtc_disable_clk(info);
return 0;
}
@@ -143,18 +137,12 @@ static int s3c_rtc_setfreq(struct s3c_rtc *info, int freq)
if (!is_power_of_2(freq))
return -EINVAL;
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
spin_lock_irq(&info->pie_lock);
if (info->data->set_freq)
info->data->set_freq(info, freq);
spin_unlock_irq(&info->pie_lock);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
return 0;
}
@@ -165,9 +153,7 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
struct s3c_rtc *info = dev_get_drvdata(dev);
unsigned int have_retried = 0;
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
+ s3c_rtc_enable_clk(info);
retry_get_time:
rtc_tm->tm_min = readb(info->base + S3C2410_RTCMIN);
@@ -194,6 +180,8 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
+ s3c_rtc_disable_clk(info);
+
rtc_tm->tm_year += 100;
dev_dbg(dev, "read time %04d.%02d.%02d %02d:%02d:%02d\n",
@@ -202,10 +190,6 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
rtc_tm->tm_mon -= 1;
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
-
return rtc_valid_tm(rtc_tm);
}
@@ -225,9 +209,7 @@ static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm)
return -EINVAL;
}
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
+ s3c_rtc_enable_clk(info);
writeb(bin2bcd(tm->tm_sec), info->base + S3C2410_RTCSEC);
writeb(bin2bcd(tm->tm_min), info->base + S3C2410_RTCMIN);
@@ -236,9 +218,7 @@ static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm)
writeb(bin2bcd(tm->tm_mon + 1), info->base + S3C2410_RTCMON);
writeb(bin2bcd(year), info->base + S3C2410_RTCYEAR);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
+ s3c_rtc_disable_clk(info);
return 0;
}
@@ -249,9 +229,7 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
struct rtc_time *alm_tm = &alrm->time;
unsigned int alm_en;
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
+ s3c_rtc_enable_clk(info);
alm_tm->tm_sec = readb(info->base + S3C2410_ALMSEC);
alm_tm->tm_min = readb(info->base + S3C2410_ALMMIN);
@@ -262,6 +240,8 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
alm_en = readb(info->base + S3C2410_RTCALM);
+ s3c_rtc_disable_clk(info);
+
alrm->enabled = (alm_en & S3C2410_RTCALM_ALMEN) ? 1 : 0;
dev_dbg(dev, "read alarm %d, %04d.%02d.%02d %02d:%02d:%02d\n",
@@ -269,9 +249,7 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
1900 + alm_tm->tm_year, alm_tm->tm_mon, alm_tm->tm_mday,
alm_tm->tm_hour, alm_tm->tm_min, alm_tm->tm_sec);
-
/* decode the alarm enable field */
-
if (alm_en & S3C2410_RTCALM_SECEN)
alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec);
else
@@ -304,10 +282,6 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
else
alm_tm->tm_year = -1;
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
-
return 0;
}
@@ -317,15 +291,13 @@ static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
struct rtc_time *tm = &alrm->time;
unsigned int alrm_en;
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
-
dev_dbg(dev, "s3c_rtc_setalarm: %d, %04d.%02d.%02d %02d:%02d:%02d\n",
alrm->enabled,
1900 + tm->tm_year, tm->tm_mon + 1, tm->tm_mday,
tm->tm_hour, tm->tm_min, tm->tm_sec);
+ s3c_rtc_enable_clk(info);
+
alrm_en = readb(info->base + S3C2410_RTCALM) & S3C2410_RTCALM_ALMEN;
writeb(0x00, info->base + S3C2410_RTCALM);
@@ -348,11 +320,9 @@ static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
writeb(alrm_en, info->base + S3C2410_RTCALM);
- s3c_rtc_setaie(dev, alrm->enabled);
+ s3c_rtc_disable_clk(info);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
+ s3c_rtc_setaie(dev, alrm->enabled);
return 0;
}
@@ -361,16 +331,12 @@ static int s3c_rtc_proc(struct device *dev, struct seq_file *seq)
{
struct s3c_rtc *info = dev_get_drvdata(dev);
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
+ s3c_rtc_enable_clk(info);
if (info->data->enable_tick)
info->data->enable_tick(info, seq);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
+ s3c_rtc_disable_clk(info);
return 0;
}
@@ -388,10 +354,6 @@ static void s3c24xx_rtc_enable(struct s3c_rtc *info)
{
unsigned int con, tmp;
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
-
con = readw(info->base + S3C2410_RTCCON);
/* re-enable the device, and check it is ok */
if ((con & S3C2410_RTCCON_RTCEN) == 0) {
@@ -417,20 +379,12 @@ static void s3c24xx_rtc_enable(struct s3c_rtc *info)
writew(tmp & ~S3C2410_RTCCON_CLKRST,
info->base + S3C2410_RTCCON);
}
-
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
}
static void s3c24xx_rtc_disable(struct s3c_rtc *info)
{
unsigned int con;
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
-
con = readw(info->base + S3C2410_RTCCON);
con &= ~S3C2410_RTCCON_RTCEN;
writew(con, info->base + S3C2410_RTCCON);
@@ -438,28 +392,16 @@ static void s3c24xx_rtc_disable(struct s3c_rtc *info)
con = readb(info->base + S3C2410_TICNT);
con &= ~S3C2410_TICNT_ENABLE;
writeb(con, info->base + S3C2410_TICNT);
-
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
}
static void s3c6410_rtc_disable(struct s3c_rtc *info)
{
unsigned int con;
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
-
con = readw(info->base + S3C2410_RTCCON);
con &= ~S3C64XX_RTCCON_TICEN;
con &= ~S3C2410_RTCCON_RTCEN;
writew(con, info->base + S3C2410_RTCCON);
-
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
}
static int s3c_rtc_remove(struct platform_device *pdev)
@@ -598,15 +540,16 @@ static int s3c_rtc_probe(struct platform_device *pdev)
s3c_rtc_setfreq(info, 1);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
+ s3c_rtc_disable_clk(info);
return 0;
err_nortc:
if (info->data->disable)
info->data->disable(info);
+
+ if (info->data->needs_src_clk)
+ clk_disable_unprepare(info->rtc_src_clk);
clk_disable_unprepare(info->rtc_clk);
return ret;
@@ -618,9 +561,7 @@ static int s3c_rtc_suspend(struct device *dev)
{
struct s3c_rtc *info = dev_get_drvdata(dev);
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
+ s3c_rtc_enable_clk(info);
/* save TICNT for anyone using periodic interrupts */
if (info->data->save_tick_cnt)
@@ -636,10 +577,6 @@ static int s3c_rtc_suspend(struct device *dev)
dev_err(dev, "enable_irq_wake failed\n");
}
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
-
return 0;
}
@@ -647,25 +584,19 @@ static int s3c_rtc_resume(struct device *dev)
{
struct s3c_rtc *info = dev_get_drvdata(dev);
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
-
if (info->data->enable)
info->data->enable(info);
if (info->data->restore_tick_cnt)
info->data->restore_tick_cnt(info);
+ s3c_rtc_disable_clk(info);
+
if (device_may_wakeup(dev) && info->wake_en) {
disable_irq_wake(info->irq_alarm);
info->wake_en = false;
}
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
-
return 0;
}
#endif
@@ -673,29 +604,13 @@ static SIMPLE_DEV_PM_OPS(s3c_rtc_pm_ops, s3c_rtc_suspend, s3c_rtc_resume);
static void s3c24xx_rtc_irq(struct s3c_rtc *info, int mask)
{
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
rtc_update_irq(info->rtc, 1, RTC_AF | RTC_IRQF);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
-
- s3c_rtc_alarm_clk_enable(info, false);
}
static void s3c6410_rtc_irq(struct s3c_rtc *info, int mask)
{
- clk_enable(info->rtc_clk);
- if (info->data->needs_src_clk)
- clk_enable(info->rtc_src_clk);
rtc_update_irq(info->rtc, 1, RTC_AF | RTC_IRQF);
writeb(mask, info->base + S3C2410_INTP);
- if (info->data->needs_src_clk)
- clk_disable(info->rtc_src_clk);
- clk_disable(info->rtc_clk);
-
- s3c_rtc_alarm_clk_enable(info, false);
}
static void s3c2410_rtc_setfreq(struct s3c_rtc *info, int freq)
diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c
index 2939cdcb2688..eb09eddf39b8 100644
--- a/drivers/rtc/rtc-stmp3xxx.c
+++ b/drivers/rtc/rtc-stmp3xxx.c
@@ -42,6 +42,8 @@
#define STMP3XXX_RTC_STAT 0x10
#define STMP3XXX_RTC_STAT_STALE_SHIFT 16
#define STMP3XXX_RTC_STAT_RTC_PRESENT 0x80000000
+#define STMP3XXX_RTC_STAT_XTAL32000_PRESENT 0x10000000
+#define STMP3XXX_RTC_STAT_XTAL32768_PRESENT 0x08000000
#define STMP3XXX_RTC_SECONDS 0x30
@@ -52,9 +54,13 @@
#define STMP3XXX_RTC_PERSISTENT0 0x60
#define STMP3XXX_RTC_PERSISTENT0_SET 0x64
#define STMP3XXX_RTC_PERSISTENT0_CLR 0x68
-#define STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE_EN 0x00000002
-#define STMP3XXX_RTC_PERSISTENT0_ALARM_EN 0x00000004
-#define STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE 0x00000080
+#define STMP3XXX_RTC_PERSISTENT0_CLOCKSOURCE (1 << 0)
+#define STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE_EN (1 << 1)
+#define STMP3XXX_RTC_PERSISTENT0_ALARM_EN (1 << 2)
+#define STMP3XXX_RTC_PERSISTENT0_XTAL24MHZ_PWRUP (1 << 4)
+#define STMP3XXX_RTC_PERSISTENT0_XTAL32KHZ_PWRUP (1 << 5)
+#define STMP3XXX_RTC_PERSISTENT0_XTAL32_FREQ (1 << 6)
+#define STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE (1 << 7)
#define STMP3XXX_RTC_PERSISTENT1 0x70
/* missing bitmask in headers */
@@ -248,6 +254,9 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev)
{
struct stmp3xxx_rtc_data *rtc_data;
struct resource *r;
+ u32 rtc_stat;
+ u32 pers0_set, pers0_clr;
+ u32 crystalfreq = 0;
int err;
rtc_data = devm_kzalloc(&pdev->dev, sizeof(*rtc_data), GFP_KERNEL);
@@ -268,8 +277,8 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev)
rtc_data->irq_alarm = platform_get_irq(pdev, 0);
- if (!(readl(STMP3XXX_RTC_STAT + rtc_data->io) &
- STMP3XXX_RTC_STAT_RTC_PRESENT)) {
+ rtc_stat = readl(rtc_data->io + STMP3XXX_RTC_STAT);
+ if (!(rtc_stat & STMP3XXX_RTC_STAT_RTC_PRESENT)) {
dev_err(&pdev->dev, "no device onboard\n");
return -ENODEV;
}
@@ -282,9 +291,54 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev)
return err;
}
+ /*
+ * Obviously the rtc needs a clock input to be able to run.
+ * This clock can be provided by an external 32k crystal. If that one is
+ * missing XTAL must not be disabled in suspend which consumes a
+ * lot of power. Normally the presence and exact frequency (supported
+ * are 32000 Hz and 32768 Hz) is detectable from fuses, but as reality
+ * proves these fuses are not blown correctly on all machines, so the
+ * frequency can be overridden in the device tree.
+ */
+ if (rtc_stat & STMP3XXX_RTC_STAT_XTAL32000_PRESENT)
+ crystalfreq = 32000;
+ else if (rtc_stat & STMP3XXX_RTC_STAT_XTAL32768_PRESENT)
+ crystalfreq = 32768;
+
+ of_property_read_u32(pdev->dev.of_node, "stmp,crystal-freq",
+ &crystalfreq);
+
+ switch (crystalfreq) {
+ case 32000:
+ /* keep 32kHz crystal running in low-power mode */
+ pers0_set = STMP3XXX_RTC_PERSISTENT0_XTAL32_FREQ |
+ STMP3XXX_RTC_PERSISTENT0_XTAL32KHZ_PWRUP |
+ STMP3XXX_RTC_PERSISTENT0_CLOCKSOURCE;
+ pers0_clr = STMP3XXX_RTC_PERSISTENT0_XTAL24MHZ_PWRUP;
+ break;
+ case 32768:
+ /* keep 32.768kHz crystal running in low-power mode */
+ pers0_set = STMP3XXX_RTC_PERSISTENT0_XTAL32KHZ_PWRUP |
+ STMP3XXX_RTC_PERSISTENT0_CLOCKSOURCE;
+ pers0_clr = STMP3XXX_RTC_PERSISTENT0_XTAL24MHZ_PWRUP |
+ STMP3XXX_RTC_PERSISTENT0_XTAL32_FREQ;
+ break;
+ default:
+ dev_warn(&pdev->dev,
+ "invalid crystal-freq specified in device-tree. Assuming no crystal\n");
+ /* fall-through */
+ case 0:
+ /* keep XTAL on in low-power mode */
+ pers0_set = STMP3XXX_RTC_PERSISTENT0_XTAL24MHZ_PWRUP;
+ pers0_clr = STMP3XXX_RTC_PERSISTENT0_XTAL32KHZ_PWRUP |
+ STMP3XXX_RTC_PERSISTENT0_CLOCKSOURCE;
+ }
+
+ writel(pers0_set, rtc_data->io + STMP3XXX_RTC_PERSISTENT0_SET);
+
writel(STMP3XXX_RTC_PERSISTENT0_ALARM_EN |
STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE_EN |
- STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE,
+ STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE | pers0_clr,
rtc_data->io + STMP3XXX_RTC_PERSISTENT0_CLR);
writel(STMP3XXX_RTC_CTRL_ONEMSEC_IRQ_EN |
diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c
index d948277057d8..60232bd366ef 100644
--- a/drivers/rtc/rtc-tegra.c
+++ b/drivers/rtc/rtc-tegra.c
@@ -261,7 +261,9 @@ static int tegra_rtc_proc(struct device *dev, struct seq_file *seq)
if (!dev || !dev->driver)
return 0;
- return seq_printf(seq, "name\t\t: %s\n", dev_name(dev));
+ seq_printf(seq, "name\t\t: %s\n", dev_name(dev));
+
+ return 0;
}
static irqreturn_t tegra_rtc_irq_handler(int irq, void *data)
diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c
index b1de58e0b3d0..5638b7ba8b06 100644
--- a/drivers/rtc/rtc-x1205.c
+++ b/drivers/rtc/rtc-x1205.c
@@ -22,6 +22,7 @@
#include <linux/rtc.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/bitops.h>
#define DRV_VERSION "1.0.8"
@@ -366,8 +367,7 @@ static int x1205_get_atrim(struct i2c_client *client, int *trim)
* perform sign extension. The formula is
* Catr = (atr * 0.25pF) + 11.00pF.
*/
- if (atr & 0x20)
- atr |= 0xC0;
+ atr = sign_extend32(atr, 5);
dev_dbg(&client->dev, "%s: raw atr=%x (%d)\n", __func__, atr, atr);
diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index b3f791b2c1f8..20314aad7ab7 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -330,18 +330,20 @@ cio_ignore_proc_seq_show(struct seq_file *s, void *it)
if (!iter->in_range) {
/* First device in range. */
if ((iter->devno == __MAX_SUBCHANNEL) ||
- !is_blacklisted(iter->ssid, iter->devno + 1))
+ !is_blacklisted(iter->ssid, iter->devno + 1)) {
/* Singular device. */
- return seq_printf(s, "0.%x.%04x\n",
- iter->ssid, iter->devno);
+ seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno);
+ return 0;
+ }
iter->in_range = 1;
- return seq_printf(s, "0.%x.%04x-", iter->ssid, iter->devno);
+ seq_printf(s, "0.%x.%04x-", iter->ssid, iter->devno);
+ return 0;
}
if ((iter->devno == __MAX_SUBCHANNEL) ||
!is_blacklisted(iter->ssid, iter->devno + 1)) {
/* Last device in range. */
iter->in_range = 0;
- return seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno);
+ seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno);
}
return 0;
}
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 2c5d4567d1da..acde3f5d6e9e 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -738,11 +738,11 @@ static int zfcp_erp_adapter_strategy_open_fsf(struct zfcp_erp_action *act)
return ZFCP_ERP_FAILED;
if (mempool_resize(act->adapter->pool.sr_data,
- act->adapter->stat_read_buf_num, GFP_KERNEL))
+ act->adapter->stat_read_buf_num))
return ZFCP_ERP_FAILED;
if (mempool_resize(act->adapter->pool.status_read_req,
- act->adapter->stat_read_buf_num, GFP_KERNEL))
+ act->adapter->stat_read_buf_num))
return ZFCP_ERP_FAILED;
atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num);
diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c
index e90a3742f09d..cc3b9d3d6d40 100644
--- a/drivers/scsi/bfa/bfad.c
+++ b/drivers/scsi/bfa/bfad.c
@@ -1079,22 +1079,18 @@ bfad_start_ops(struct bfad_s *bfad) {
int
bfad_worker(void *ptr)
{
- struct bfad_s *bfad;
- unsigned long flags;
-
- bfad = (struct bfad_s *)ptr;
-
- while (!kthread_should_stop()) {
+ struct bfad_s *bfad = ptr;
+ unsigned long flags;
- /* Send event BFAD_E_INIT_SUCCESS */
- bfa_sm_send_event(bfad, BFAD_E_INIT_SUCCESS);
+ if (kthread_should_stop())
+ return 0;
- spin_lock_irqsave(&bfad->bfad_lock, flags);
- bfad->bfad_tsk = NULL;
- spin_unlock_irqrestore(&bfad->bfad_lock, flags);
+ /* Send event BFAD_E_INIT_SUCCESS */
+ bfa_sm_send_event(bfad, BFAD_E_INIT_SUCCESS);
- break;
- }
+ spin_lock_irqsave(&bfad->bfad_lock, flags);
+ bfad->bfad_tsk = NULL;
+ spin_unlock_irqrestore(&bfad->bfad_lock, flags);
return 0;
}
diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
index 6725467ef4d0..62c7bba75274 100644
--- a/drivers/staging/lustre/lustre/Kconfig
+++ b/drivers/staging/lustre/lustre/Kconfig
@@ -10,6 +10,7 @@ config LUSTRE_FS
select CRYPTO_SHA1
select CRYPTO_SHA256
select CRYPTO_SHA512
+ depends on MULTIUSER
help
This option enables Lustre file system client support. Choose Y
here if you want to access a Lustre file system cluster. To compile
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index a260e99a4447..d72605864b0a 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -55,7 +55,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (PagePrivate(page))
page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
- cancel_dirty_page(page, PAGE_SIZE);
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping);
+
ClearPageMappedToDisk(page);
ll_delete_from_page_cache(page);
}
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index 8a65423bc696..c4211a31612d 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -397,13 +397,15 @@ static int __init xen_tmem_init(void)
#ifdef CONFIG_CLEANCACHE
BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
if (tmem_enabled && cleancache) {
- char *s = "";
- struct cleancache_ops *old_ops =
- cleancache_register_ops(&tmem_cleancache_ops);
- if (old_ops)
- s = " (WARNING: cleancache_ops overridden)";
- pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n",
- s);
+ int err;
+
+ err = cleancache_register_ops(&tmem_cleancache_ops);
+ if (err)
+ pr_warn("xen-tmem: failed to enable cleancache: %d\n",
+ err);
+ else
+ pr_info("cleancache enabled, RAM provided by "
+ "Xen Transcendent Memory\n");
}
#endif
#ifdef CONFIG_XEN_SELFBALLOONING
diff --git a/firmware/ihex2fw.c b/firmware/ihex2fw.c
index cf38e159131a..08d90e25abf0 100644
--- a/firmware/ihex2fw.c
+++ b/firmware/ihex2fw.c
@@ -86,6 +86,7 @@ int main(int argc, char **argv)
case 'j':
include_jump = 1;
break;
+ default:
return usage();
}
}
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 270c48148f79..2d0cbbd14cfc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
-config ARCH_BINFMT_ELF_RANDOMIZE_PIE
- bool
-
config ARCH_BINFMT_ELF_STATE
bool
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index f2ba88ab4aed..82d14cdf70f9 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -61,6 +61,7 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
kcalloc(size, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!bh_fplus) {
+ ret = -ENOMEM;
adfs_error(sb, "not enough memory for"
" dir object %X (%d blocks)", id, size);
goto out;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9852bdf34d76..a19c31d3f369 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -316,7 +316,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di
dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL);
if (dm == NULL) {
adfs_error(sb, "not enough memory");
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
for (zone = 0; zone < nzones; zone++, map_addr++) {
@@ -349,7 +349,7 @@ error_free:
brelse(dm[zone].dm_bh);
kfree(dm);
- return NULL;
+ return ERR_PTR(-EIO);
}
static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits)
@@ -370,6 +370,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
unsigned char *b_data;
struct adfs_sb_info *asb;
struct inode *root;
+ int ret = -EINVAL;
sb->s_flags |= MS_NODIRATIME;
@@ -391,6 +392,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
sb_set_blocksize(sb, BLOCK_SIZE);
if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) {
adfs_error(sb, "unable to read superblock");
+ ret = -EIO;
goto error;
}
@@ -400,6 +402,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!silent)
printk("VFS: Can't find an adfs filesystem on dev "
"%s.\n", sb->s_id);
+ ret = -EINVAL;
goto error_free_bh;
}
@@ -412,6 +415,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!silent)
printk("VPS: Can't find an adfs filesystem on dev "
"%s.\n", sb->s_id);
+ ret = -EINVAL;
goto error_free_bh;
}
@@ -421,11 +425,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!bh) {
adfs_error(sb, "couldn't read superblock on "
"2nd try.");
+ ret = -EIO;
goto error;
}
b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize);
if (adfs_checkbblk(b_data)) {
adfs_error(sb, "disc record mismatch, very weird!");
+ ret = -EINVAL;
goto error_free_bh;
}
dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
@@ -433,6 +439,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!silent)
printk(KERN_ERR "VFS: Unsupported blocksize on dev "
"%s.\n", sb->s_id);
+ ret = -EINVAL;
goto error;
}
@@ -447,10 +454,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
asb->s_size = adfs_discsize(dr, sb->s_blocksize_bits);
asb->s_version = dr->format_version;
asb->s_log2sharesize = dr->log2sharesize;
-
+
asb->s_map = adfs_read_map(sb, dr);
- if (!asb->s_map)
+ if (IS_ERR(asb->s_map)) {
+ ret = PTR_ERR(asb->s_map);
goto error_free_bh;
+ }
brelse(bh);
@@ -499,6 +508,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
brelse(asb->s_map[i].dm_bh);
kfree(asb->s_map);
adfs_error(sb, "get root inode failed\n");
+ ret = -EIO;
goto error;
}
return 0;
@@ -508,7 +518,7 @@ error_free_bh:
error:
sb->s_fs_info = NULL;
kfree(asb);
- return -EINVAL;
+ return ret;
}
static struct dentry *adfs_mount(struct file_system_type *fs_type,
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c8764bd7497d..cffe8370fb44 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -106,18 +106,22 @@ struct affs_sb_info {
spinlock_t work_lock; /* protects sb_work and work_queued */
};
-#define SF_INTL 0x0001 /* International filesystem. */
-#define SF_BM_VALID 0x0002 /* Bitmap is valid. */
-#define SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */
-#define SF_QUIET 0x0008 /* chmod errors will be not reported */
-#define SF_SETUID 0x0010 /* Ignore Amiga uid */
-#define SF_SETGID 0x0020 /* Ignore Amiga gid */
-#define SF_SETMODE 0x0040 /* Ignore Amiga protection bits */
-#define SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */
-#define SF_OFS 0x0200 /* Old filesystem */
-#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */
-#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */
-#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */
+#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */
+#define AFFS_MOUNT_SF_BM_VALID 0x0002 /* Bitmap is valid. */
+#define AFFS_MOUNT_SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */
+#define AFFS_MOUNT_SF_QUIET 0x0008 /* chmod errors will be not reported */
+#define AFFS_MOUNT_SF_SETUID 0x0010 /* Ignore Amiga uid */
+#define AFFS_MOUNT_SF_SETGID 0x0020 /* Ignore Amiga gid */
+#define AFFS_MOUNT_SF_SETMODE 0x0040 /* Ignore Amiga protection bits */
+#define AFFS_MOUNT_SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */
+#define AFFS_MOUNT_SF_OFS 0x0200 /* Old filesystem */
+#define AFFS_MOUNT_SF_PREFIX 0x0400 /* Buffer for prefix is allocated */
+#define AFFS_MOUNT_SF_VERBOSE 0x0800 /* Talk about fs when mounting */
+#define AFFS_MOUNT_SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */
+
+#define affs_clear_opt(o, opt) (o &= ~AFFS_MOUNT_##opt)
+#define affs_set_opt(o, opt) (o |= AFFS_MOUNT_##opt)
+#define affs_test_opt(o, opt) ((o) & AFFS_MOUNT_##opt)
/* short cut to get to the affs specific sb data */
static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 388da1ea815d..5022ac96aa40 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -472,7 +472,8 @@ bool
affs_nofilenametruncate(const struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+
+ return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE);
}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 3aa7eb66547e..49f9413b681e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -915,7 +915,7 @@ affs_truncate(struct inode *inode)
if (inode->i_size) {
AFFS_I(inode)->i_blkcnt = last_blk + 1;
AFFS_I(inode)->i_extcnt = ext + 1;
- if (AFFS_SB(sb)->s_flags & SF_OFS) {
+ if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS)) {
struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
u32 tmp;
if (IS_ERR(bh)) {
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 6f34510449e8..9628003ccd2f 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -66,23 +66,23 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
AFFS_I(inode)->i_lastalloc = 0;
AFFS_I(inode)->i_pa_cnt = 0;
- if (sbi->s_flags & SF_SETMODE)
+ if (affs_test_opt(sbi->s_flags, SF_SETMODE))
inode->i_mode = sbi->s_mode;
else
inode->i_mode = prot_to_mode(prot);
id = be16_to_cpu(tail->uid);
- if (id == 0 || sbi->s_flags & SF_SETUID)
+ if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETUID))
inode->i_uid = sbi->s_uid;
- else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
+ else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS))
i_uid_write(inode, 0);
else
i_uid_write(inode, id);
id = be16_to_cpu(tail->gid);
- if (id == 0 || sbi->s_flags & SF_SETGID)
+ if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETGID))
inode->i_gid = sbi->s_gid;
- else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
+ else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS))
i_gid_write(inode, 0);
else
i_gid_write(inode, id);
@@ -94,7 +94,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
/* fall through */
case ST_USERDIR:
if (be32_to_cpu(tail->stype) == ST_USERDIR ||
- sbi->s_flags & SF_SETMODE) {
+ affs_test_opt(sbi->s_flags, SF_SETMODE)) {
if (inode->i_mode & S_IRUSR)
inode->i_mode |= S_IXUSR;
if (inode->i_mode & S_IRGRP)
@@ -133,7 +133,8 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
}
if (tail->link_chain)
set_nlink(inode, 2);
- inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
+ inode->i_mapping->a_ops = affs_test_opt(sbi->s_flags, SF_OFS) ?
+ &affs_aops_ofs : &affs_aops;
inode->i_op = &affs_file_inode_operations;
inode->i_fop = &affs_file_operations;
break;
@@ -190,15 +191,15 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
uid = i_uid_read(inode);
gid = i_gid_read(inode);
- if (AFFS_SB(sb)->s_flags & SF_MUFS) {
+ if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_MUFS)) {
if (uid == 0 || uid == 0xFFFF)
uid = uid ^ ~0;
if (gid == 0 || gid == 0xFFFF)
gid = gid ^ ~0;
}
- if (!(AFFS_SB(sb)->s_flags & SF_SETUID))
+ if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETUID))
tail->uid = cpu_to_be16(uid);
- if (!(AFFS_SB(sb)->s_flags & SF_SETGID))
+ if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETGID))
tail->gid = cpu_to_be16(gid);
}
}
@@ -221,11 +222,14 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
if (error)
goto out;
- if (((attr->ia_valid & ATTR_UID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETUID)) ||
- ((attr->ia_valid & ATTR_GID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETGID)) ||
+ if (((attr->ia_valid & ATTR_UID) &&
+ affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETUID)) ||
+ ((attr->ia_valid & ATTR_GID) &&
+ affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETGID)) ||
((attr->ia_valid & ATTR_MODE) &&
- (AFFS_SB(inode->i_sb)->s_flags & (SF_SETMODE | SF_IMMUTABLE)))) {
- if (!(AFFS_SB(inode->i_sb)->s_flags & SF_QUIET))
+ (AFFS_SB(inode->i_sb)->s_flags &
+ (AFFS_MOUNT_SF_SETMODE | AFFS_MOUNT_SF_IMMUTABLE)))) {
+ if (!affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_QUIET))
error = -EPERM;
goto out;
}
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ffb7bd82c2a5..ec8ca0efb960 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -53,7 +53,8 @@ affs_intl_toupper(int ch)
static inline toupper_t
affs_get_toupper(struct super_block *sb)
{
- return AFFS_SB(sb)->s_flags & SF_INTL ? affs_intl_toupper : affs_toupper;
+ return affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL) ?
+ affs_intl_toupper : affs_toupper;
}
/*
@@ -275,7 +276,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
inode->i_op = &affs_file_inode_operations;
inode->i_fop = &affs_file_operations;
- inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
+ inode->i_mapping->a_ops = affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS) ?
+ &affs_aops_ofs : &affs_aops;
error = affs_add_entry(dir, inode, dentry, ST_FILE);
if (error) {
clear_nlink(inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4cf0e9113fb6..3f89c9e05b40 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -227,22 +227,22 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
if (match_octal(&args[0], &option))
return 0;
*mode = option & 0777;
- *mount_opts |= SF_SETMODE;
+ affs_set_opt(*mount_opts, SF_SETMODE);
break;
case Opt_mufs:
- *mount_opts |= SF_MUFS;
+ affs_set_opt(*mount_opts, SF_MUFS);
break;
case Opt_notruncate:
- *mount_opts |= SF_NO_TRUNCATE;
+ affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
break;
case Opt_prefix:
*prefix = match_strdup(&args[0]);
if (!*prefix)
return 0;
- *mount_opts |= SF_PREFIX;
+ affs_set_opt(*mount_opts, SF_PREFIX);
break;
case Opt_protect:
- *mount_opts |= SF_IMMUTABLE;
+ affs_set_opt(*mount_opts, SF_IMMUTABLE);
break;
case Opt_reserved:
if (match_int(&args[0], reserved))
@@ -258,7 +258,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
*gid = make_kgid(current_user_ns(), option);
if (!gid_valid(*gid))
return 0;
- *mount_opts |= SF_SETGID;
+ affs_set_opt(*mount_opts, SF_SETGID);
break;
case Opt_setuid:
if (match_int(&args[0], &option))
@@ -266,10 +266,10 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
*uid = make_kuid(current_user_ns(), option);
if (!uid_valid(*uid))
return 0;
- *mount_opts |= SF_SETUID;
+ affs_set_opt(*mount_opts, SF_SETUID);
break;
case Opt_verbose:
- *mount_opts |= SF_VERBOSE;
+ affs_set_opt(*mount_opts, SF_VERBOSE);
break;
case Opt_volume: {
char *vol = match_strdup(&args[0]);
@@ -435,30 +435,31 @@ got_root:
case MUFS_FS:
case MUFS_INTLFFS:
case MUFS_DCFFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
/* fall thru */
case FS_INTLFFS:
case FS_DCFFS:
- sbi->s_flags |= SF_INTL;
+ affs_set_opt(sbi->s_flags, SF_INTL);
break;
case MUFS_FFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
break;
case FS_FFS:
break;
case MUFS_OFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
/* fall thru */
case FS_OFS:
- sbi->s_flags |= SF_OFS;
+ affs_set_opt(sbi->s_flags, SF_OFS);
sb->s_flags |= MS_NOEXEC;
break;
case MUFS_DCOFS:
case MUFS_INTLOFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
case FS_DCOFS:
case FS_INTLOFS:
- sbi->s_flags |= SF_INTL | SF_OFS;
+ affs_set_opt(sbi->s_flags, SF_INTL);
+ affs_set_opt(sbi->s_flags, SF_OFS);
sb->s_flags |= MS_NOEXEC;
break;
default:
@@ -467,7 +468,7 @@ got_root:
return -EINVAL;
}
- if (mount_flags & SF_VERBOSE) {
+ if (affs_test_opt(mount_flags, SF_VERBOSE)) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
@@ -478,7 +479,7 @@ got_root:
sb->s_flags |= MS_NODEV | MS_NOSUID;
sbi->s_data_blksize = sb->s_blocksize;
- if (sbi->s_flags & SF_OFS)
+ if (affs_test_opt(sbi->s_flags, SF_OFS))
sbi->s_data_blksize -= 24;
tmp_flags = sb->s_flags;
@@ -493,7 +494,7 @@ got_root:
if (IS_ERR(root_inode))
return PTR_ERR(root_inode);
- if (AFFS_SB(sb)->s_flags & SF_INTL)
+ if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
sb->s_d_op = &affs_intl_dentry_operations;
else
sb->s_d_op = &affs_dentry_operations;
@@ -520,10 +521,14 @@ affs_remount(struct super_block *sb, int *flags, char *data)
int root_block;
unsigned long mount_flags;
int res = 0;
- char *new_opts = kstrdup(data, GFP_KERNEL);
+ char *new_opts;
char volume[32];
char *prefix = NULL;
+ new_opts = kstrdup(data, GFP_KERNEL);
+ if (!new_opts)
+ return -ENOMEM;
+
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 995986b8e36b..a115da230ce0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/random.h>
#include <linux/elf.h>
+#include <linux/elf-randomize.h>
#include <linux/utsname.h>
#include <linux/coredump.h>
#include <linux/sched.h>
@@ -909,21 +910,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
* default mmap base, as well as whatever program they
* might try to exec. This is because the brk will
* follow the loader, and is not movable. */
-#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
- /* Memory randomization might have been switched off
- * in runtime via sysctl or explicit setting of
- * personality flags.
- * If that is the case, retain the original non-zero
- * load_bias value in order to establish proper
- * non-randomized mappings.
- */
+ load_bias = ELF_ET_DYN_BASE - vaddr;
if (current->flags & PF_RANDOMIZE)
- load_bias = 0;
- else
- load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
-#else
- load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
-#endif
+ load_bias += arch_mmap_rnd();
+ load_bias = ELF_PAGESTART(load_bias);
}
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
@@ -1053,15 +1043,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
current->mm->end_data = end_data;
current->mm->start_stack = bprm->p;
-#ifdef arch_randomize_brk
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
-#ifdef CONFIG_COMPAT_BRK
+#ifdef compat_brk_randomized
current->brk_randomized = 1;
#endif
}
-#endif
if (current->personality & MMAP_PAGE_ZERO) {
/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 97aff2879cda..9dcb05409ba7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -9,6 +9,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
@@ -521,9 +522,8 @@ static int parse_command(const char __user *buffer, size_t count)
static void entry_status(Node *e, char *page)
{
- char *dp;
- char *status = "disabled";
- const char *flags = "flags: ";
+ char *dp = page;
+ const char *status = "disabled";
if (test_bit(Enabled, &e->flags))
status = "enabled";
@@ -533,12 +533,10 @@ static void entry_status(Node *e, char *page)
return;
}
- sprintf(page, "%s\ninterpreter %s\n", status, e->interpreter);
- dp = page + strlen(page);
+ dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter);
/* print the special flags */
- sprintf(dp, "%s", flags);
- dp += strlen(flags);
+ dp += sprintf(dp, "flags: ");
if (e->flags & MISC_FMT_PRESERVE_ARGV0)
*dp++ = 'P';
if (e->flags & MISC_FMT_OPEN_BINARY)
@@ -550,21 +548,11 @@ static void entry_status(Node *e, char *page)
if (!test_bit(Magic, &e->flags)) {
sprintf(dp, "extension .%s\n", e->magic);
} else {
- int i;
-
- sprintf(dp, "offset %i\nmagic ", e->offset);
- dp = page + strlen(page);
- for (i = 0; i < e->size; i++) {
- sprintf(dp, "%02x", 0xff & (int) (e->magic[i]));
- dp += 2;
- }
+ dp += sprintf(dp, "offset %i\nmagic ", e->offset);
+ dp = bin2hex(dp, e->magic, e->size);
if (e->mask) {
- sprintf(dp, "\nmask ");
- dp += 6;
- for (i = 0; i < e->size; i++) {
- sprintf(dp, "%02x", 0xff & (int) (e->mask[i]));
- dp += 2;
- }
+ dp += sprintf(dp, "\nmask ");
+ dp = bin2hex(dp, e->mask, e->size);
}
*dp++ = '\n';
*dp = '\0';
diff --git a/fs/buffer.c b/fs/buffer.c
index 20805db2c987..c7a5602d01ee 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3243,8 +3243,8 @@ int try_to_free_buffers(struct page *page)
* to synchronise against __set_page_dirty_buffers and prevent the
* dirty bit from being lost.
*/
- if (ret)
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
+ if (ret && TestClearPageDirty(page))
+ account_page_cleaned(page, mapping);
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4cb8450e081b..62f033df56d0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -773,8 +773,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
length = atomic_dec_return(&tcpSesAllocCount);
if (length > 0)
- mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
- GFP_KERNEL);
+ mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
}
static int
@@ -848,8 +847,7 @@ cifs_demultiplex_thread(void *p)
length = atomic_inc_return(&tcpSesAllocCount);
if (length > 1)
- mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
- GFP_KERNEL);
+ mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
set_freezable();
while (server->tcpStatus != CifsExiting) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ca30c391a894..b9fd85dfee9b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3413,13 +3413,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, GFP_KERNEL);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3440,10 +3440,10 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index,
GFP_KERNEL)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/exec.c b/fs/exec.c
index c7f9b733406d..314e8d83a351 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -926,10 +926,14 @@ static int de_thread(struct task_struct *tsk)
if (!thread_group_leader(tsk)) {
struct task_struct *leader = tsk->group_leader;
- sig->notify_count = -1; /* for exit_notify() */
for (;;) {
threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock);
+ /*
+ * Do this under tasklist_lock to ensure that
+ * exit_notify() can't miss ->group_exit_task
+ */
+ sig->notify_count = -1;
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a8bc47f75fa0..5b6e9f246233 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -107,7 +107,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 91ad9e1c9441..5d384921524d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -8,9 +8,7 @@
* May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers.
*/
-#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include "fat.h"
/* this must be > 0. */
@@ -303,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can access on only allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -323,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index c5d6bb939d19..4c71c8c76426 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -13,13 +13,9 @@
* Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
*/
-#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
-#include <linux/kernel.h>
#include "fat.h"
/*
@@ -95,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 64e295e8ff38..4307cd4f8da0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -2,11 +2,8 @@
#define _FAT_H
#include <linux/buffer_head.h>
-#include <linux/string.h>
#include <linux/nls.h>
-#include <linux/fs.h>
#include <linux/hash.h>
-#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/msdos_fs.h>
@@ -66,7 +63,7 @@ struct msdos_sb_info {
unsigned short sec_per_clus; /* sectors/cluster */
unsigned short cluster_bits; /* log2(cluster_size) */
unsigned int cluster_size; /* cluster size */
- unsigned char fats, fat_bits; /* number of FATs, FAT bits (12 or 16) */
+ unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */
unsigned short fat_start;
unsigned long fat_length; /* FAT start & length (sec.) */
unsigned long dir_start;
@@ -288,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -387,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 260705c58062..8226557130a2 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -3,9 +3,6 @@
* Released under GPL v2.
*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/msdos_fs.h>
#include <linux/blkdev.h>
#include "fat.h"
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 8429c68e3057..e86038da3875 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -10,15 +10,15 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/mount.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -182,6 +182,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -220,6 +221,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block aligned on-disk size in bytes*/
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize)
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation.We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8521207de229..c71b00fbca28 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -11,20 +11,12 @@
*/
#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/time.h>
-#include <linux/slab.h>
-#include <linux/seq_file.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
-#include <linux/buffer_head.h>
-#include <linux/mount.h>
#include <linux/vfs.h>
+#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/uio.h>
-#include <linux/writeback.h>
-#include <linux/log2.h>
-#include <linux/hash.h>
#include <linux/blkdev.h>
#include <asm/unaligned.h>
#include "fat.h"
@@ -100,7 +92,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -122,10 +114,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -142,8 +134,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * allocate a cluster according to the following.
+ * 1) no more available blocks
+ * 2) not part of fallocate region
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -155,7 +153,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -281,13 +279,38 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0);
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -561,13 +584,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/iogstart
+ * for the zero byte file. So, make it return to
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
@@ -1279,8 +1332,7 @@ out:
static int fat_read_root(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
- struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
int error;
MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d8da2d2e30ae..c4589e981760 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -6,10 +6,6 @@
* and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/time.h>
#include "fat.h"
/*
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index a783b0e1272a..cc6a8541b668 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -7,8 +7,6 @@
*/
#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
#include "fat.h"
/* Characters that are undesirable in an MS-DOS file name */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b8b92c2f9683..7e0974eebd8e 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -16,10 +16,8 @@
*/
#include <linux/module.h>
-#include <linux/jiffies.h>
#include <linux/ctype.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/namei.h>
#include "fat.h"
diff --git a/fs/file.c b/fs/file.c
index ee738ea028fa..93c5f89c248b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -638,8 +638,7 @@ static struct file *__fget(unsigned int fd, fmode_t mask)
file = fcheck_files(files, fd);
if (file) {
/* File object ref couldn't be taken */
- if ((file->f_mode & mask) ||
- !atomic_long_inc_not_zero(&file->f_count))
+ if ((file->f_mode & mask) || !get_file_rcu(file))
file = NULL;
}
rcu_read_unlock();
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index c1422d91cd36..528e38b5af7f 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -118,9 +118,7 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd,
int b, e;
int res;
- if (!rec_found)
- BUG();
-
+ BUG_ON(!rec_found);
b = 0;
e = bnode->num_recs - 1;
res = -ENOENT;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 7892e6fddb66..022974ab6e3c 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -350,10 +350,11 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
&fd.search_key->cat.name.unicode,
off + 2, len);
fd.search_key->key_len = cpu_to_be16(6 + len);
- } else
+ } else {
err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
if (unlikely(err))
goto out;
+ }
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index d3ff5cc317d7..8e98f5db6ad6 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -76,7 +76,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
{
struct inode *inode = file_inode(file);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- unsigned int flags;
+ unsigned int flags, new_fl = 0;
int err = 0;
err = mnt_want_write_file(file);
@@ -110,14 +110,12 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
}
if (flags & FS_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & FS_APPEND_FL)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
+ new_fl |= S_APPEND;
+
+ inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND);
if (flags & FS_NODUMP_FL)
hip->userflags |= HFSPLUS_FLG_NODUMP;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index d98094a9f476..8d62de0f4504 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -44,7 +44,7 @@ static int strcmp_xattr_acl(const char *name)
return -1;
}
-static inline int is_known_namespace(const char *name)
+static bool is_known_namespace(const char *name)
{
if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
@@ -424,6 +424,28 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
return len;
}
+int hfsplus_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags,
+ const char *prefix, size_t prefixlen)
+{
+ char *xattr_name;
+ int res;
+
+ if (!strcmp(name, ""))
+ return -EINVAL;
+
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
+ strcpy(xattr_name, prefix);
+ strcpy(xattr_name + prefixlen, name);
+ res = __hfsplus_setxattr(dentry->d_inode, xattr_name, value, size,
+ flags);
+ kfree(xattr_name);
+ return res;
+}
+
static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
void *value, size_t size)
{
@@ -560,6 +582,30 @@ failed_getxattr_init:
return res;
}
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size,
+ const char *prefix, size_t prefixlen)
+{
+ int res;
+ char *xattr_name;
+
+ if (!strcmp(name, ""))
+ return -EINVAL;
+
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
+
+ strcpy(xattr_name, prefix);
+ strcpy(xattr_name + prefixlen, name);
+
+ res = __hfsplus_getxattr(dentry->d_inode, xattr_name, value, size);
+ kfree(xattr_name);
+ return res;
+
+}
+
static inline int can_list(const char *xattr_name)
{
if (!xattr_name)
@@ -806,9 +852,6 @@ end_removexattr:
static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
-
if (!strcmp(name, ""))
return -EINVAL;
@@ -818,24 +861,14 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
*/
if (is_known_namespace(name))
return -EOPNOTSUPP;
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
- + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
- strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ return hfsplus_getxattr(dentry, name, buffer, size,
+ XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
}
static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
if (!strcmp(name, ""))
return -EINVAL;
@@ -845,16 +878,10 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
*/
if (is_known_namespace(name))
return -EOPNOTSUPP;
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
- + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
- strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ XATTR_MAC_OSX_PREFIX,
+ XATTR_MAC_OSX_PREFIX_LEN);
}
static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 288530cf80b5..f9b0955b3d28 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -21,22 +21,16 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[];
int __hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-static inline int hfsplus_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags);
-}
+int hfsplus_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags,
+ const char *prefix, size_t prefixlen);
ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
- void *value, size_t size);
-
-static inline ssize_t hfsplus_getxattr(struct dentry *dentry,
- const char *name,
- void *value,
- size_t size)
-{
- return __hfsplus_getxattr(dentry->d_inode, name, value, size);
-}
+ void *value, size_t size);
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size,
+ const char *prefix, size_t prefixlen);
ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 6ec5e107691f..aacff00a9ff9 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -16,43 +16,17 @@
static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_SECURITY_PREFIX);
- strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
-
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ return hfsplus_getxattr(dentry, name, buffer, size,
+ XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
}
static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_SECURITY_PREFIX);
- strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
-
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
}
static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 3c5f27e4746a..bcf65089b7f7 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -14,43 +14,16 @@
static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
- strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
-
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ return hfsplus_getxattr(dentry, name, buffer, size,
+ XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN);
}
static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
- strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
-
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 2b625a538b64..5aa0e6dc4a1e 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -14,43 +14,16 @@
static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_USER_PREFIX);
- strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
-
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ return hfsplus_getxattr(dentry, name, buffer, size,
+ XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_USER_PREFIX);
- strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
-
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c274aca8e8dc..3a8f12762821 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -47,9 +47,10 @@ struct hugetlbfs_config {
kuid_t uid;
kgid_t gid;
umode_t mode;
- long nr_blocks;
+ long max_hpages;
long nr_inodes;
struct hstate *hstate;
+ long min_hpages;
};
struct hugetlbfs_inode_info {
@@ -67,7 +68,7 @@ int sysctl_hugetlb_shm_group;
enum {
Opt_size, Opt_nr_inodes,
Opt_mode, Opt_uid, Opt_gid,
- Opt_pagesize,
+ Opt_pagesize, Opt_min_size,
Opt_err,
};
@@ -78,6 +79,7 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_pagesize, "pagesize=%s"},
+ {Opt_min_size, "min_size=%s"},
{Opt_err, NULL},
};
@@ -319,7 +321,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
static void truncate_huge_page(struct page *page)
{
- cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
+ ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
@@ -754,14 +756,38 @@ static const struct super_operations hugetlbfs_ops = {
.show_options = generic_show_options,
};
+enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+/*
+ * Convert size option passed from command line to number of huge pages
+ * in the pool specified by hstate. Size option could be in bytes
+ * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
+ */
+static long long
+hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
+ int val_type)
+{
+ if (val_type == NO_SIZE)
+ return -1;
+
+ if (val_type == SIZE_PERCENT) {
+ size_opt <<= huge_page_shift(h);
+ size_opt *= h->max_huge_pages;
+ do_div(size_opt, 100);
+ }
+
+ size_opt >>= huge_page_shift(h);
+ return size_opt;
+}
+
static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
char *p, *rest;
substring_t args[MAX_OPT_ARGS];
int option;
- unsigned long long size = 0;
- enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
+ unsigned long long max_size_opt = 0, min_size_opt = 0;
+ int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
if (!options)
return 0;
@@ -799,10 +825,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
/* memparse() will accept a K/M/G without a digit */
if (!isdigit(*args[0].from))
goto bad_val;
- size = memparse(args[0].from, &rest);
- setsize = SIZE_STD;
+ max_size_opt = memparse(args[0].from, &rest);
+ max_val_type = SIZE_STD;
if (*rest == '%')
- setsize = SIZE_PERCENT;
+ max_val_type = SIZE_PERCENT;
break;
}
@@ -825,6 +851,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
break;
}
+ case Opt_min_size: {
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(*args[0].from))
+ goto bad_val;
+ min_size_opt = memparse(args[0].from, &rest);
+ min_val_type = SIZE_STD;
+ if (*rest == '%')
+ min_val_type = SIZE_PERCENT;
+ break;
+ }
+
default:
pr_err("Bad mount option: \"%s\"\n", p);
return -EINVAL;
@@ -832,15 +869,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
}
}
- /* Do size after hstate is set up */
- if (setsize > NO_SIZE) {
- struct hstate *h = pconfig->hstate;
- if (setsize == SIZE_PERCENT) {
- size <<= huge_page_shift(h);
- size *= h->max_huge_pages;
- do_div(size, 100);
- }
- pconfig->nr_blocks = (size >> huge_page_shift(h));
+ /*
+ * Use huge page pool size (in hstate) to convert the size
+ * options to number of huge pages. If NO_SIZE, -1 is returned.
+ */
+ pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+ max_size_opt, max_val_type);
+ pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+ min_size_opt, min_val_type);
+
+ /*
+ * If max_size was specified, then min_size must be smaller
+ */
+ if (max_val_type > NO_SIZE &&
+ pconfig->min_hpages > pconfig->max_hpages) {
+ pr_err("minimum size can not be greater than maximum size\n");
+ return -EINVAL;
}
return 0;
@@ -859,12 +903,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
save_mount_options(sb, data);
- config.nr_blocks = -1; /* No limit on size by default */
+ config.max_hpages = -1; /* No limit on size by default */
config.nr_inodes = -1; /* No limit on number of inodes by default */
config.uid = current_fsuid();
config.gid = current_fsgid();
config.mode = 0755;
config.hstate = &default_hstate;
+ config.min_hpages = -1; /* No default minimum size */
ret = hugetlbfs_parse_options(data, &config);
if (ret)
return ret;
@@ -878,8 +923,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
sbinfo->spool = NULL;
- if (config.nr_blocks != -1) {
- sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+ /*
+ * Allocate and initialize subpool if maximum or minimum size is
+ * specified. Any needed reservations (for minimim size) are taken
+ * taken when the subpool is created.
+ */
+ if (config.max_hpages != -1 || config.min_hpages != -1) {
+ sbinfo->spool = hugepage_new_subpool(config.hstate,
+ config.max_hpages,
+ config.min_hpages);
if (!sbinfo->spool)
goto out_free;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b96bd8076b70..0bc333b4a594 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
*/
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
-retry_alloc:
- new_bh = alloc_buffer_head(GFP_NOFS);
- if (!new_bh) {
- /*
- * Failure is not an option, but __GFP_NOFAIL is going
- * away; so we retry ourselves here.
- */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry_alloc;
- }
+ new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5f09370c90a8..dac4523fa142 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
alloc_transaction:
if (!journal->j_running_transaction) {
+ /*
+ * If __GFP_FS is not present, then we may be being called from
+ * inside the fs writeback layer, so we MUST NOT fail.
+ */
+ if ((gfp_mask & __GFP_FS) == 0)
+ gfp_mask |= __GFP_NOFAIL;
new_transaction = kmem_cache_zalloc(transaction_cache,
gfp_mask);
- if (!new_transaction) {
- /*
- * If __GFP_FS is not present, then we may be
- * being called from inside the fs writeback
- * layer, so we MUST NOT fail. Since
- * __GFP_NOFAIL is going away, we will arrange
- * to retry the allocation ourselves.
- */
- if ((gfp_mask & __GFP_FS) == 0) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto alloc_transaction;
- }
+ if (!new_transaction)
return -ENOMEM;
- }
}
jbd_debug(3, "New handle %p going live.\n", handle);
diff --git a/fs/locks.c b/fs/locks.c
index dba01917f88f..cd8d3fad8ffc 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2591,6 +2591,44 @@ static int locks_show(struct seq_file *f, void *v)
return 0;
}
+static void __show_fd_locks(struct seq_file *f,
+ struct list_head *head, int *id,
+ struct file *filp, struct files_struct *files)
+{
+ struct file_lock *fl;
+
+ list_for_each_entry(fl, head, fl_list) {
+
+ if (filp != fl->fl_file)
+ continue;
+ if (fl->fl_owner != files &&
+ fl->fl_owner != filp)
+ continue;
+
+ (*id)++;
+ seq_puts(f, "lock:\t");
+ lock_get_status(f, fl, *id, "");
+ }
+}
+
+void show_fd_locks(struct seq_file *f,
+ struct file *filp, struct files_struct *files)
+{
+ struct inode *inode = file_inode(filp);
+ struct file_lock_context *ctx;
+ int id = 0;
+
+ ctx = inode->i_flctx;
+ if (!ctx)
+ return;
+
+ spin_lock(&ctx->flc_lock);
+ __show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
+ __show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
+ __show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
+ spin_unlock(&ctx->flc_lock);
+}
+
static void *locks_start(struct seq_file *f, loff_t *pos)
__acquires(&blocked_lock_lock)
{
diff --git a/fs/mpage.c b/fs/mpage.c
index 3e79220babac..587c7ed4185d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -590,7 +591,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -614,7 +615,7 @@ alloc_new:
*/
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -624,7 +625,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -636,7 +637,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -692,8 +693,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -710,8 +714,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index c7abc10279af..f31fd0dd92c6 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,6 @@
config NFS_FS
tristate "NFS client support"
- depends on INET && FILE_LOCKING
+ depends on INET && FILE_LOCKING && MULTIUSER
select LOCKD
select SUNRPC
select NFS_ACL_SUPPORT if NFS_V3_ACL
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 849ed784d6ac..759931088094 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1876,11 +1876,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
* request from the inode / page_private pointer and
* release it */
nfs_inode_remove_request(req);
- /*
- * In case nfs_inode_remove_request has marked the
- * page as being dirty
- */
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
nfs_unlock_and_release_request(req);
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 8b87b7a0eab5..a0b77fc1bd39 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -6,6 +6,7 @@ config NFSD
select SUNRPC
select EXPORTFS
select NFS_ACL_SUPPORT if NFSD_V2_ACL
+ depends on MULTIUSER
help
Choose Y here if you want to allow other computers to access
files residing on this system using Sun's Network File System
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 741fd02e0444..8df0f3b7839b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
static int nilfs_palloc_count_desc_blocks(struct inode *inode,
unsigned long *desc_blocks)
{
- unsigned long blknum;
+ __u64 blknum;
int ret;
ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
if (likely(!ret))
*desc_blocks = DIV_ROUND_UP(
- blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+ (unsigned long)blknum,
+ NILFS_MDT(inode)->mi_blocks_per_desc_block);
return ret;
}
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index aadbd0b5e3e8..27f75bcbeb30 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
*
* %-EEXIST - A record associated with @key already exist.
*/
-int nilfs_bmap_insert(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long rec)
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec)
{
int ret;
@@ -191,19 +189,47 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
return bmap->b_ops->bop_delete(bmap, key);
}
-int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
+/**
+ * nilfs_bmap_seek_key - seek a valid entry and return its key
+ * @bmap: bmap struct
+ * @start: start key number
+ * @keyp: place to store valid key
+ *
+ * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap
+ * starting from @start, and stores it to @keyp if found.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No valid entry was found
+ */
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp)
{
- __u64 lastkey;
int ret;
down_read(&bmap->b_sem);
- ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+ ret = bmap->b_ops->bop_seek_key(bmap, start, keyp);
+ up_read(&bmap->b_sem);
+
+ if (ret < 0)
+ ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+ return ret;
+}
+
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
+{
+ int ret;
+
+ down_read(&bmap->b_sem);
+ ret = bmap->b_ops->bop_last_key(bmap, keyp);
up_read(&bmap->b_sem);
if (ret < 0)
ret = nilfs_bmap_convert_error(bmap, __func__, ret);
- else
- *key = lastkey;
return ret;
}
@@ -224,7 +250,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
*
* %-ENOENT - A record associated with @key does not exist.
*/
-int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key)
{
int ret;
@@ -235,7 +261,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
return nilfs_bmap_convert_error(bmap, __func__, ret);
}
-static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key)
{
__u64 lastkey;
int ret;
@@ -276,7 +302,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
*
* %-ENOMEM - Insufficient amount of memory available.
*/
-int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key)
{
int ret;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b89e68076adc..bfa817ce40b3 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -76,8 +76,10 @@ struct nilfs_bmap_operations {
union nilfs_binfo *);
int (*bop_mark)(struct nilfs_bmap *, __u64, int);
- /* The following functions are internal use only. */
+ int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *);
int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
+
+ /* The following functions are internal use only. */
int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
int (*bop_check_delete)(struct nilfs_bmap *, __u64);
int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
@@ -153,10 +155,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
-int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
-int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
-int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
-int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
void nilfs_bmap_clear(struct nilfs_bmap *);
int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index ecdbae19a766..841d177d58d6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -633,6 +633,44 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
return 0;
}
+/**
+ * nilfs_btree_get_next_key - get next valid key from btree path array
+ * @btree: bmap struct of btree
+ * @path: array of nilfs_btree_path struct
+ * @minlevel: start level
+ * @nextkey: place to store the next valid key
+ *
+ * Return Value: If a next key was found, 0 is returned. Otherwise,
+ * -ENOENT is returned.
+ */
+static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree,
+ const struct nilfs_btree_path *path,
+ int minlevel, __u64 *nextkey)
+{
+ struct nilfs_btree_node *node;
+ int maxlevel = nilfs_btree_height(btree) - 1;
+ int index, next_adj, level;
+
+ /* Next index is already set to bp_index for leaf nodes. */
+ next_adj = 0;
+ for (level = minlevel; level <= maxlevel; level++) {
+ if (level == maxlevel)
+ node = nilfs_btree_get_root(btree);
+ else
+ node = nilfs_btree_get_nonroot_node(path, level);
+
+ index = path[level].bp_index + next_adj;
+ if (index < nilfs_btree_node_get_nchildren(node)) {
+ /* Next key is in this node */
+ *nextkey = nilfs_btree_node_get_key(node, index);
+ return 0;
+ }
+ /* For non-leaf nodes, next index is stored at bp_index + 1. */
+ next_adj = 1;
+ }
+ return -ENOENT;
+}
+
static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
__u64 key, int level, __u64 *ptrp)
{
@@ -1563,6 +1601,30 @@ out:
return ret;
}
+static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start,
+ __u64 *keyp)
+{
+ struct nilfs_btree_path *path;
+ const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN;
+ int ret;
+
+ if (start > NILFS_BTREE_KEY_MAX)
+ return -ENOENT;
+
+ path = nilfs_btree_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0);
+ if (!ret)
+ *keyp = start;
+ else if (ret == -ENOENT)
+ ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp);
+
+ nilfs_btree_free_path(path);
+ return ret;
+}
+
static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
{
struct nilfs_btree_path *path;
@@ -2298,7 +2360,9 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = {
.bop_assign = nilfs_btree_assign,
.bop_mark = nilfs_btree_mark,
+ .bop_seek_key = nilfs_btree_seek_key,
.bop_last_key = nilfs_btree_last_key,
+
.bop_check_insert = NULL,
.bop_check_delete = nilfs_btree_check_delete,
.bop_gather_data = nilfs_btree_gather_data,
@@ -2318,7 +2382,9 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
.bop_assign = nilfs_btree_assign_gc,
.bop_mark = NULL,
+ .bop_seek_key = NULL,
.bop_last_key = NULL,
+
.bop_check_insert = NULL,
.bop_check_delete = NULL,
.bop_gather_data = NULL,
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 0d58075f34e2..b6596cab9e99 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -53,6 +53,13 @@ nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
}
+static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile,
+ unsigned long blkoff)
+{
+ return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff
+ + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset;
+}
+
static unsigned long
nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
__u64 curr,
@@ -146,6 +153,44 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
create, nilfs_cpfile_block_init, bhp);
}
+/**
+ * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile
+ * @cpfile: inode of cpfile
+ * @start_cno: start checkpoint number (inclusive)
+ * @end_cno: end checkpoint number (inclusive)
+ * @cnop: place to store the next checkpoint number
+ * @bhp: place to store a pointer to buffer_head struct
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block exists in the range.
+ */
+static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile,
+ __u64 start_cno, __u64 end_cno,
+ __u64 *cnop,
+ struct buffer_head **bhp)
+{
+ unsigned long start, end, blkoff;
+ int ret;
+
+ if (unlikely(start_cno > end_cno))
+ return -ENOENT;
+
+ start = nilfs_cpfile_get_blkoff(cpfile, start_cno);
+ end = nilfs_cpfile_get_blkoff(cpfile, end_cno);
+
+ ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp);
+ if (!ret)
+ *cnop = (blkoff == start) ? start_cno :
+ nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff);
+ return ret;
+}
+
static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
__u64 cno)
{
@@ -403,14 +448,15 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
return -ENOENT; /* checkpoint number 0 is invalid */
down_read(&NILFS_MDT(cpfile)->mi_sem);
- for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
- ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
- ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
+ for (n = 0; n < nci; cno += ncps) {
+ ret = nilfs_cpfile_find_checkpoint_block(
+ cpfile, cno, cur_cno - 1, &cno, &bh);
if (ret < 0) {
- if (ret != -ENOENT)
- goto out;
- continue; /* skip hole */
+ if (likely(ret == -ENOENT))
+ break;
+ goto out;
}
+ ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
kaddr = kmap_atomic(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 82f4865e86dd..ebf89fd8ac1a 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -173,6 +173,21 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
return ret;
}
+static int nilfs_direct_seek_key(const struct nilfs_bmap *direct, __u64 start,
+ __u64 *keyp)
+{
+ __u64 key;
+
+ for (key = start; key <= NILFS_DIRECT_KEY_MAX; key++) {
+ if (nilfs_direct_get_ptr(direct, key) !=
+ NILFS_BMAP_INVALID_PTR) {
+ *keyp = key;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
{
__u64 key, lastkey;
@@ -355,7 +370,9 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = {
.bop_assign = nilfs_direct_assign,
.bop_mark = NULL,
+ .bop_seek_key = nilfs_direct_seek_key,
.bop_last_key = nilfs_direct_last_key,
+
.bop_check_insert = nilfs_direct_check_insert,
.bop_check_delete = NULL,
.bop_gather_data = nilfs_direct_gather_data,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index ab4987bc637f..07577cbe668a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
if (unlikely(err))
goto out;
- err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
+ err = nilfs_bmap_insert(ii->i_bmap, blkoff,
(unsigned long)bh_result);
if (unlikely(err != 0)) {
if (err == -EEXIST) {
@@ -714,7 +714,7 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
unsigned long from)
{
- unsigned long b;
+ __u64 b;
int ret;
if (!test_bit(NILFS_I_BMAP, &ii->i_state))
@@ -729,7 +729,7 @@ repeat:
if (b < from)
return;
- b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
+ b -= min_t(__u64, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
ret = nilfs_bmap_truncate(ii->i_bmap, b);
nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
if (!ret || (ret == -ENOMEM &&
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 892cf5ffdb8e..dee34d990281 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -261,6 +261,60 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
}
/**
+ * nilfs_mdt_find_block - find and get a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @start: start block offset (inclusive)
+ * @end: end block offset (inclusive)
+ * @blkoff: block offset
+ * @out_bh: place to store a pointer to buffer_head struct
+ *
+ * nilfs_mdt_find_block() looks up an existing block in range of
+ * [@start, @end] and stores pointer to a buffer head of the block to
+ * @out_bh, and block offset to @blkoff, respectively. @out_bh and
+ * @blkoff are substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block was found in the range
+ */
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+ unsigned long end, unsigned long *blkoff,
+ struct buffer_head **out_bh)
+{
+ __u64 next;
+ int ret;
+
+ if (unlikely(start > end))
+ return -ENOENT;
+
+ ret = nilfs_mdt_read_block(inode, start, true, out_bh);
+ if (!ret) {
+ *blkoff = start;
+ goto out;
+ }
+ if (unlikely(ret != -ENOENT || start == ULONG_MAX))
+ goto out;
+
+ ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next);
+ if (!ret) {
+ if (next <= end) {
+ ret = nilfs_mdt_read_block(inode, next, true, out_bh);
+ if (!ret)
+ *blkoff = next;
+ } else {
+ ret = -ENOENT;
+ }
+ }
+out:
+ return ret;
+}
+
+/**
* nilfs_mdt_delete_block - make a hole on the meta data file.
* @inode: inode of the meta data file
* @block: block offset
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ab172e8549c5..fe529a87a208 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -78,6 +78,9 @@ int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
struct buffer_head *, void *),
struct buffer_head **);
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+ unsigned long end, unsigned long *blkoff,
+ struct buffer_head **out_bh);
int nilfs_mdt_delete_block(struct inode *, unsigned long);
int nilfs_mdt_forget_block(struct inode *, unsigned long);
int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
@@ -111,7 +114,10 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
}
-#define nilfs_mdt_bgl_lock(inode, bg) \
- (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
+static inline spinlock_t *
+nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group)
+{
+ return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group);
+}
#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 700ecbcca55d..45d650addd56 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -89,18 +89,16 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
void nilfs_forget_buffer(struct buffer_head *bh)
{
struct page *page = bh->b_page;
+ const unsigned long clear_bits =
+ (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+ 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+ 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
lock_buffer(bh);
- clear_buffer_nilfs_volatile(bh);
- clear_buffer_nilfs_checked(bh);
- clear_buffer_nilfs_redirected(bh);
- clear_buffer_async_write(bh);
- clear_buffer_dirty(bh);
+ set_mask_bits(&bh->b_state, clear_bits, 0);
if (nilfs_page_buffers_clean(page))
__nilfs_clear_page_dirty(page);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
bh->b_blocknr = -1;
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -421,6 +419,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
if (page_has_buffers(page)) {
struct buffer_head *bh, *head;
+ const unsigned long clear_bits =
+ (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+ 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+ 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
bh = head = page_buffers(page);
do {
@@ -430,13 +432,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
"discard block %llu, size %zu",
(u64)bh->b_blocknr, bh->b_size);
}
- clear_buffer_async_write(bh);
- clear_buffer_dirty(bh);
- clear_buffer_nilfs_volatile(bh);
- clear_buffer_nilfs_checked(bh);
- clear_buffer_nilfs_redirected(bh);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
+ set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 0c3f303baf32..c6abbad9b8e3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -24,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/bitops.h>
#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/blkdev.h>
@@ -1588,7 +1589,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page) {
lock_page(bd_page);
@@ -1688,7 +1688,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1768,7 +1767,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1788,12 +1786,13 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
*/
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- clear_buffer_async_write(bh);
- clear_buffer_delay(bh);
- clear_buffer_nilfs_volatile(bh);
- clear_buffer_nilfs_redirected(bh);
+ const unsigned long set_bits = (1 << BH_Uptodate);
+ const unsigned long clear_bits =
+ (1 << BH_Dirty | 1 << BH_Async_Write |
+ 1 << BH_Delay | 1 << BH_NILFS_Volatile |
+ 1 << BH_NILFS_Redirected);
+
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 044158bd22be..2b0aaa6205b8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -908,32 +908,32 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Extent block #%llu has bad signature %.*s",
(unsigned long long)bh->b_blocknr, 7,
eb->h_signature);
- return -EINVAL;
+ goto bail;
}
if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Extent block #%llu has an invalid h_blkno "
"of %llu",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(eb->h_blkno));
- return -EINVAL;
+ goto bail;
}
if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Extent block #%llu has an invalid "
"h_fs_generation of #%u",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(eb->h_fs_generation));
- return -EINVAL;
+ goto bail;
}
-
- return 0;
+bail:
+ return rc;
}
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -3370,7 +3370,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
ret = ocfs2_get_right_path(et, left_path, &right_path);
if (ret) {
mlog_errno(ret);
- goto out;
+ return ret;
}
right_el = path_leaf_el(right_path);
@@ -3453,8 +3453,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
subtree_index);
}
out:
- if (right_path)
- ocfs2_free_path(right_path);
+ ocfs2_free_path(right_path);
return ret;
}
@@ -3536,7 +3535,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
ret = ocfs2_get_left_path(et, right_path, &left_path);
if (ret) {
mlog_errno(ret);
- goto out;
+ return ret;
}
left_el = path_leaf_el(left_path);
@@ -3647,8 +3646,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
right_path, subtree_index);
}
out:
- if (left_path)
- ocfs2_free_path(left_path);
+ ocfs2_free_path(left_path);
return ret;
}
@@ -4334,17 +4332,17 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
} else if (path->p_tree_depth > 0) {
status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
if (status)
- goto out;
+ goto exit;
if (left_cpos != 0) {
left_path = ocfs2_new_path_from_path(path);
if (!left_path)
- goto out;
+ goto exit;
status = ocfs2_find_path(et->et_ci, left_path,
left_cpos);
if (status)
- goto out;
+ goto free_left_path;
new_el = path_leaf_el(left_path);
@@ -4361,7 +4359,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
le16_to_cpu(new_el->l_next_free_rec),
le16_to_cpu(new_el->l_count));
status = -EINVAL;
- goto out;
+ goto free_left_path;
}
rec = &new_el->l_recs[
le16_to_cpu(new_el->l_next_free_rec) - 1];
@@ -4388,18 +4386,18 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
path->p_tree_depth > 0) {
status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
if (status)
- goto out;
+ goto free_left_path;
if (right_cpos == 0)
- goto out;
+ goto free_left_path;
right_path = ocfs2_new_path_from_path(path);
if (!right_path)
- goto out;
+ goto free_left_path;
status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (status)
- goto out;
+ goto free_right_path;
new_el = path_leaf_el(right_path);
rec = &new_el->l_recs[0];
@@ -4413,7 +4411,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec));
status = -EINVAL;
- goto out;
+ goto free_right_path;
}
rec = &new_el->l_recs[1];
}
@@ -4430,12 +4428,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
ret = contig_type;
}
-out:
- if (left_path)
- ocfs2_free_path(left_path);
- if (right_path)
- ocfs2_free_path(right_path);
-
+free_right_path:
+ ocfs2_free_path(right_path);
+free_left_path:
+ ocfs2_free_path(left_path);
+exit:
return ret;
}
@@ -6858,13 +6855,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
if (pages == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
- goto out;
+ return ret;
}
ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto free_pages;
}
}
@@ -6996,9 +6993,8 @@ out_commit:
out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
- if (pages)
- kfree(pages);
-
+free_pages:
+ kfree(pages);
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e1bf18c5d25e..27ffd4bfc548 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -664,6 +664,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb,
return 0;
}
+static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
+ struct inode *inode, loff_t offset,
+ u64 zero_len, int cluster_align)
+{
+ u32 p_cpos = 0;
+ u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+ int ret = 0;
+
+ if (offset <= i_size_read(inode) || cluster_align)
+ return 0;
+
+ ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
+ &ext_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ u64 s = i_size_read(inode);
+ sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) +
+ (do_div(s, osb->s_clustersize) >> 9);
+
+ ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
+ zero_len >> 9, GFP_NOFS, false);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ return ret;
+}
+
+static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
+ struct inode *inode, loff_t offset)
+{
+ u64 zero_start, zero_len, total_zero_len;
+ u32 p_cpos = 0, clusters_to_add;
+ u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+ u32 size_div, offset_div;
+ int ret = 0;
+
+ {
+ u64 o = offset;
+ u64 s = i_size_read(inode);
+
+ offset_div = do_div(o, osb->s_clustersize);
+ size_div = do_div(s, osb->s_clustersize);
+ }
+
+ if (offset <= i_size_read(inode))
+ return 0;
+
+ clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
+ ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
+ total_zero_len = offset - i_size_read(inode);
+ if (clusters_to_add)
+ total_zero_len -= offset_div;
+
+ /* Allocate clusters to fill out holes, and this is only needed
+ * when we add more than one clusters. Otherwise the cluster will
+ * be allocated during direct IO */
+ if (clusters_to_add > 1) {
+ ret = ocfs2_extend_allocation(inode,
+ OCFS2_I(inode)->ip_clusters,
+ clusters_to_add - 1, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ while (total_zero_len) {
+ ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
+ &ext_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
+ size_div;
+ zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
+ size_div;
+ zero_len = min(total_zero_len, zero_len);
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+ zero_start >> 9, zero_len >> 9,
+ GFP_NOFS, false);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ total_zero_len -= zero_len;
+ v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
+
+ /* Only at first iteration can be cluster not aligned.
+ * So set size_div to 0 for the rest */
+ size_div = 0;
+ }
+
+out:
+ return ret;
+}
+
static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset)
@@ -678,8 +789,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
struct buffer_head *di_bh = NULL;
size_t count = iter->count;
journal_t *journal = osb->journal->j_journal;
- u32 zero_len;
- int cluster_align;
+ u64 zero_len_head, zero_len_tail;
+ int cluster_align_head, cluster_align_tail;
loff_t final_size = offset + count;
int append_write = offset >= i_size_read(inode) ? 1 : 0;
unsigned int num_clusters = 0;
@@ -687,9 +798,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
{
u64 o = offset;
+ u64 s = i_size_read(inode);
+
+ zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
+ cluster_align_head = !zero_len_head;
- zero_len = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align = !zero_len;
+ zero_len_tail = osb->s_clustersize -
+ do_div(s, osb->s_clustersize);
+ if ((offset - i_size_read(inode)) < zero_len_tail)
+ zero_len_tail = offset - i_size_read(inode);
+ cluster_align_tail = !zero_len_tail;
}
/*
@@ -707,21 +825,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
}
if (append_write) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ ret = ocfs2_inode_lock(inode, NULL, 1);
if (ret < 0) {
mlog_errno(ret);
goto clean_orphan;
}
+ /* zeroing out the previously allocated cluster tail
+ * that but not zeroed */
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_zero_extend(inode, di_bh, offset);
+ ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
+ zero_len_tail, cluster_align_tail);
else
- ret = ocfs2_extend_no_holes(inode, di_bh, offset,
+ ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
offset);
if (ret < 0) {
mlog_errno(ret);
ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
goto clean_orphan;
}
@@ -729,13 +849,10 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
if (is_overwrite < 0) {
mlog_errno(is_overwrite);
ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
goto clean_orphan;
}
ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
}
written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
@@ -772,15 +889,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
if (ret < 0)
mlog_errno(ret);
}
- } else if (written < 0 && append_write && !is_overwrite &&
- !cluster_align) {
+ } else if (written > 0 && append_write && !is_overwrite &&
+ !cluster_align_head) {
+ /* zeroing out the allocated cluster head */
u32 p_cpos = 0;
u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto clean_orphan;
+ }
+
ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
&num_clusters, &ext_flags);
if (ret < 0) {
mlog_errno(ret);
+ ocfs2_inode_unlock(inode, 0);
goto clean_orphan;
}
@@ -788,9 +913,11 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
ret = blkdev_issue_zeroout(osb->sb->s_bdev,
p_cpos << (osb->s_clustersize_bits - 9),
- zero_len >> 9, GFP_KERNEL, false);
+ zero_len_head >> 9, GFP_NOFS, false);
if (ret < 0)
mlog_errno(ret);
+
+ ocfs2_inode_unlock(inode, 0);
}
clean_orphan:
@@ -2053,16 +2180,6 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out_quota;
- }
/*
* Fill our page array first. That way we've grabbed enough so
@@ -2213,7 +2330,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- int i;
+ int i, ret;
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2263,6 +2380,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
}
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
out_write_size:
pos += copied;
if (pos > i_size_read(inode)) {
@@ -2284,6 +2409,7 @@ out_write_size:
*/
ocfs2_unlock_pages(wc);
+out:
ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb141f639..fe50ded1b4ce 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
+ if (status) {
+ /* Clear the rest of the buffers on error */
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16eff45727ee..0870fffd2a35 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
-
+#include <linux/ktime.h>
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -1061,37 +1061,6 @@ bail:
return ret;
}
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
- struct timeval *b)
-{
- /* just return 0 when a is after b */
- if (a->tv_sec < b->tv_sec ||
- (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
- a->tv_sec = 0;
- a->tv_usec = 0;
- return;
- }
-
- a->tv_sec -= b->tv_sec;
- a->tv_usec -= b->tv_usec;
- while ( a->tv_usec < 0 ) {
- a->tv_sec--;
- a->tv_usec += 1000000;
- }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
- struct timeval *end)
-{
- struct timeval res = *end;
-
- o2hb_tv_subtract(&res, start);
-
- return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
/*
* we ride the region ref that the region dir holds. before the region
* dir is removed and drops it ref it will wait to tear down this
@@ -1102,7 +1071,7 @@ static int o2hb_thread(void *data)
int i, ret;
struct o2hb_region *reg = data;
struct o2hb_bio_wait_ctxt write_wc;
- struct timeval before_hb, after_hb;
+ ktime_t before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1119,18 +1088,18 @@ static int o2hb_thread(void *data)
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
- do_gettimeofday(&before_hb);
+ before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
- do_gettimeofday(&after_hb);
- elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+ after_hb = ktime_get_real();
+
+ elapsed_msec = (unsigned int)
+ ktime_ms_delta(after_hb, before_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
- before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
- after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec, ret);
+ "start = %lld, end = %lld, msec = %u, ret = %d\n",
+ before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
@@ -1312,7 +1281,9 @@ static int o2hb_debug_init(void)
int ret = -ENOMEM;
o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
- if (!o2hb_debug_dir) {
+ if (IS_ERR_OR_NULL(o2hb_debug_dir)) {
+ ret = o2hb_debug_dir ?
+ PTR_ERR(o2hb_debug_dir) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -1325,7 +1296,9 @@ static int o2hb_debug_init(void)
sizeof(o2hb_live_node_bitmap),
O2NM_MAX_NODES,
o2hb_live_node_bitmap);
- if (!o2hb_debug_livenodes) {
+ if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) {
+ ret = o2hb_debug_livenodes ?
+ PTR_ERR(o2hb_debug_livenodes) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -1338,7 +1311,9 @@ static int o2hb_debug_init(void)
sizeof(o2hb_live_region_bitmap),
O2NM_MAX_REGIONS,
o2hb_live_region_bitmap);
- if (!o2hb_debug_liveregions) {
+ if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) {
+ ret = o2hb_debug_liveregions ?
+ PTR_ERR(o2hb_debug_liveregions) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -1352,7 +1327,9 @@ static int o2hb_debug_init(void)
sizeof(o2hb_quorum_region_bitmap),
O2NM_MAX_REGIONS,
o2hb_quorum_region_bitmap);
- if (!o2hb_debug_quorumregions) {
+ if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) {
+ ret = o2hb_debug_quorumregions ?
+ PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -1366,7 +1343,9 @@ static int o2hb_debug_init(void)
sizeof(o2hb_failed_region_bitmap),
O2NM_MAX_REGIONS,
o2hb_failed_region_bitmap);
- if (!o2hb_debug_failedregions) {
+ if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) {
+ ret = o2hb_debug_failedregions ?
+ PTR_ERR(o2hb_debug_failedregions) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -2000,7 +1979,8 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
reg->hr_debug_dir =
debugfs_create_dir(config_item_name(&reg->hr_item), dir);
- if (!reg->hr_debug_dir) {
+ if (IS_ERR_OR_NULL(reg->hr_debug_dir)) {
+ ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -2013,7 +1993,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
O2HB_DB_TYPE_REGION_LIVENODES,
sizeof(reg->hr_live_node_bitmap),
O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_livenodes) {
+ if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) {
+ ret = reg->hr_debug_livenodes ?
+ PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -2025,7 +2007,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
sizeof(*(reg->hr_db_regnum)),
O2HB_DB_TYPE_REGION_NUMBER,
0, O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_regnum) {
+ if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) {
+ ret = reg->hr_debug_regnum ?
+ PTR_ERR(reg->hr_debug_regnum) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -2037,7 +2021,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
sizeof(*(reg->hr_db_elapsed_time)),
O2HB_DB_TYPE_REGION_ELAPSED_TIME,
0, 0, reg);
- if (!reg->hr_debug_elapsed_time) {
+ if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) {
+ ret = reg->hr_debug_elapsed_time ?
+ PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
@@ -2049,13 +2035,16 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
sizeof(*(reg->hr_db_pinned)),
O2HB_DB_TYPE_REGION_PINNED,
0, 0, reg);
- if (!reg->hr_debug_pinned) {
+ if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) {
+ ret = reg->hr_debug_pinned ?
+ PTR_ERR(reg->hr_debug_pinned) : -ENOMEM;
mlog_errno(ret);
goto bail;
}
- ret = 0;
+ return 0;
bail:
+ debugfs_remove_recursive(reg->hr_debug_dir);
return ret;
}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b08050bd3f2e..52b9a06b1784 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -18,7 +18,7 @@
*
* linux/fs/minix/dir.c
*
- * Copyright (C) 1991, 1992 Linux Torvalds
+ * Copyright (C) 1991, 1992 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -480,8 +480,7 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
+ rc = ocfs2_error(dir->i_sb,
"Invalid dirblock #%llu: "
"signature = %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
@@ -489,8 +488,7 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
goto out;
}
if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
+ rc = ocfs2_error(dir->i_sb,
"Directory block #%llu has an invalid "
"db_blkno of %llu",
(unsigned long long)bh->b_blocknr,
@@ -499,8 +497,7 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
}
if (le64_to_cpu(trailer->db_parent_dinode) !=
OCFS2_I(dir)->ip_blkno) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
+ rc = ocfs2_error(dir->i_sb,
"Directory block #%llu on dinode "
"#%llu has an invalid parent_dinode "
"of %llu",
@@ -604,14 +601,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
- ocfs2_error(sb,
+ ret = ocfs2_error(sb,
"Dir Index Root # %llu has bad signature %.*s",
(unsigned long long)le64_to_cpu(dx_root->dr_blkno),
7, dx_root->dr_signature);
- return -EINVAL;
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +644,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
- ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+ ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
7, dx_leaf->dl_signature);
- return -EROFS;
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +807,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
+ ret = ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
"btree tree block %llu\n", inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
goto out;
}
}
@@ -832,11 +826,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
if (!found) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
"record (%u, %u, 0) in btree", inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
goto out;
}
@@ -2047,22 +2040,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
int namelen)
{
- int ret;
+ int ret = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
trace_ocfs2_check_dir_for_entry(
(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
- ret = -EEXIST;
- if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
- goto bail;
+ if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) {
+ ret = -EEXIST;
+ mlog_errno(ret);
+ }
- ret = 0;
-bail:
ocfs2_free_dir_lookup_result(&lookup);
- if (ret)
- mlog_errno(ret);
return ret;
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a6944b25fd5b..52c1a14ade29 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -782,8 +772,19 @@ lookup:
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
- if (res)
+ if (res) {
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not "
+ "on the Tracking list\n",
+ res->lockname.len,
+ res->lockname.name);
+ }
+ spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
+ }
res = tmpres;
goto leave;
}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ad..2e5e6d5fffe8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 11849a44dc5a..23adcbf374d3 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2954,7 +2954,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
osb->osb_debug_root,
osb,
&ocfs2_dlm_debug_fops);
- if (!dlm_debug->d_locking_state) {
+ if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) {
ret = -EINVAL;
mlog(ML_ERROR,
"Unable to create locking state debugfs file.\n");
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 29651167190d..540dc4bdd042 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
}
status = ocfs2_test_inode_bit(osb, blkno, &set);
- trace_ocfs2_get_dentry_test_bit(status, set);
if (status < 0) {
if (status == -EINVAL) {
/*
@@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
goto unlock_nfs_sync;
}
+ trace_ocfs2_get_dentry_test_bit(status, set);
/* If the inode allocator bit is clear, this inode must be stale */
if (!set) {
status = -ESTALE;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 3025c0da6b8a..d608b0d48d64 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode,
ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
le16_to_cpu(di->i_suballoc_slot));
if (!inode_alloc_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto bail;
}
@@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
ORPHAN_DIR_SYSTEM_INODE,
orphaned_slot);
if (!orphan_dir_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto bail;
}
@@ -1191,17 +1191,19 @@ void ocfs2_evict_inode(struct inode *inode)
int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink, oi->ip_flags);
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state |= I_WILL_FREE;
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ spin_lock(&inode->i_lock);
+ WARN_ON(inode->i_state & I_NEW);
+ inode->i_state &= ~I_WILL_FREE;
- return res;
+ return 1;
}
/*
@@ -1350,21 +1352,21 @@ int ocfs2_validate_inode_block(struct super_block *sb,
rc = -EINVAL;
if (!OCFS2_IS_VALID_DINODE(di)) {
- ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
di->i_signature);
goto bail;
}
if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
- ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(di->i_blkno));
goto bail;
}
if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
(unsigned long long)bh->b_blocknr);
goto bail;
@@ -1372,7 +1374,7 @@ int ocfs2_validate_inode_block(struct super_block *sb,
if (le32_to_cpu(di->i_fs_generation) !=
OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Invalid dinode #%llu: fs_generation is %u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(di->i_fs_generation));
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 044013455621..857bbbcd39f3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u free bits, but a count shows %u",
+ "%u used bits, but a count shows %u",
(unsigned long long)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
@@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
u32 *numbits,
struct ocfs2_alloc_reservation *resv)
{
- int numfound, bitoff, left, startoff, lastzero;
+ int numfound = 0, bitoff, left, startoff, lastzero;
int local_resv = 0;
struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d06aa6..70dd0ec7b7e9 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,10 @@ static int __ocfs2_move_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(inode->i_sb,
+ ret = ocfs2_error(inode->i_sb,
"Inode %llu has an extent at cpos %u which can no "
"longer be found.\n",
(unsigned long long)ino, cpos);
- ret = -EROFS;
goto out;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b5c3a5ea3ee6..f35cb7a7b601 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1291,6 +1291,15 @@ static int ocfs2_rename(struct inode *old_dir,
}
parents_locked = 1;
+ if (!new_dir->i_nlink) {
+ mlog(ML_ERROR, "new dir %llu has been removed, inode %llu "
+ "can not be moved into it.",
+ (unsigned long long)new_dir->i_ino,
+ (unsigned long long)old_inode->i_ino);
+ status = -EACCES;
+ goto bail;
+ }
+
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!new_dir_bh) {
@@ -2322,10 +2331,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
trace_ocfs2_orphan_del(
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- name, namelen);
+ name, strlen(name));
/* find it's spot in the orphan directory */
- status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
+ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
&lookup);
if (status) {
mlog_errno(status);
@@ -2808,7 +2817,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ORPHAN_DIR_SYSTEM_INODE,
osb->slot_num);
if (!orphan_dir_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto leave;
}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 460c6c37e683..2c923223ec11 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
+ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
};
#define OCFS2_OSB_SOFT_RO 0x0001
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ee541f92dab4..b001516bc4e7 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,32 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Refcount block #%llu has bad signature %.*s",
(unsigned long long)bh->b_blocknr, 7,
rb->rf_signature);
- return -EINVAL;
+ goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Refcount block #%llu has an invalid rf_blkno "
"of %llu",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(rb->rf_blkno));
- return -EINVAL;
+ goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
+ rc = ocfs2_error(sb,
"Refcount block #%llu has an invalid "
"rf_fs_generation of #%u",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(rb->rf_fs_generation));
- return -EINVAL;
+ goto out;
}
-
- return 0;
+out:
+ return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1102,11 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(sb,
- "refcount tree %llu has non zero tree "
- "depth in leaf btree tree block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "refcount tree %llu has non zero tree "
+ "depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2361,10 +2360,9 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
"tree, but the feature bit is not set in the "
"super block.", inode->i_ino);
- ret = -EROFS;
goto out;
}
@@ -2547,10 +2545,9 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
"tree, but the feature bit is not set in the "
"super block.", inode->i_ino);
- ret = -EROFS;
goto out;
}
@@ -2674,11 +2671,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
+ ret = ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
"leaf block %llu\n", inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
goto out;
}
}
@@ -3108,11 +3104,10 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(sb,
+ ret = ocfs2_error(sb,
"Inode %llu has an extent at cpos %u which can no "
"longer be found.\n",
(unsigned long long)ino, cpos);
- ret = -EROFS;
goto out;
}
@@ -3378,10 +3373,9 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
"tree, but the feature bit is not set in the "
"super block.", inode->i_ino);
- return -EROFS;
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
@@ -4276,7 +4270,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error) {
mlog_errno(error);
- goto out;
+ return error;
}
error = ocfs2_create_inode_in_orphan(dir, mode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d5493e361a38..e78a203d44c8 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
if (!si) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ return status;
}
si->si_extended = ocfs2_uses_extended_slot_map(osb);
@@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
- if (status < 0 && si)
+ if (status < 0)
__ocfs2_free_slot_info(si);
return status;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 1724d43d3da1..220cae7bbdbc 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -295,7 +295,7 @@ static int o2cb_cluster_check(void)
set_bit(node_num, netmap);
if (!memcmp(hbmap, netmap, sizeof(hbmap)))
return 0;
- if (i < O2CB_MAP_STABILIZE_COUNT)
+ if (i < O2CB_MAP_STABILIZE_COUNT - 1)
msleep(1000);
}
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 720aa389e0ea..2768eb1da2b8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
BUG_ON(conn == NULL);
lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!lc) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!lc)
+ return -ENOMEM;
init_waitqueue_head(&lc->oc_wait);
init_completion(&lc->oc_sync_wait);
@@ -1063,7 +1061,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
}
out:
- if (rc && lc)
+ if (rc)
kfree(lc);
return rc;
}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 0cb889a17ae1..e4bb00110e91 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -171,7 +171,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
if (resize) \
mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
else \
- ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
} while (0)
static int ocfs2_validate_gd_self(struct super_block *sb,
@@ -184,7 +184,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
do_error("Group descriptor #%llu has bad signature %.*s",
(unsigned long long)bh->b_blocknr, 7,
gd->bg_signature);
- return -EINVAL;
}
if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
@@ -192,7 +191,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
"of %llu",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_blkno));
- return -EINVAL;
}
if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
@@ -200,7 +198,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
"fs_generation of #%u",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(gd->bg_generation));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
@@ -209,7 +206,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
@@ -218,7 +214,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
- return -EINVAL;
}
return 0;
@@ -238,7 +233,6 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
- return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
@@ -246,7 +240,6 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
do_error("Group descriptor #%llu has bit count of %u",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits));
- return -EINVAL;
}
/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -257,7 +250,6 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
do_error("Group descriptor #%llu has bad chain %u",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
- return -EINVAL;
}
return 0;
@@ -384,11 +376,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct super_block * sb = alloc_inode->i_sb;
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
- ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
+ status = ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
"b_blocknr (%llu)",
(unsigned long long)group_blkno,
(unsigned long long) bg_bh->b_blocknr);
- status = -EIO;
goto bail;
}
@@ -834,9 +825,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
- ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
+ status = ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
(unsigned long long)le64_to_cpu(fe->i_blkno));
- status = -EIO;
goto bail;
}
@@ -1370,12 +1360,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
" count %u but claims %u are freed. num_bits %d",
(unsigned long long)le64_to_cpu(bg->bg_blkno),
le16_to_cpu(bg->bg_bits),
le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
}
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1894,12 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(ac->ac_inode->i_sb,
+ status = ocfs2_error(ac->ac_inode->i_sb,
"Chain allocator dinode %llu has %u used "
"bits but only %u total.",
(unsigned long long)le64_to_cpu(fe->i_blkno),
le32_to_cpu(fe->id1.bitmap1.i_used),
le32_to_cpu(fe->id1.bitmap1.i_total));
- status = -EIO;
goto bail;
}
@@ -2429,12 +2417,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
" count %u but claims %u are freed. num_bits %d",
(unsigned long long)le64_to_cpu(bg->bg_blkno),
le16_to_cpu(bg->bg_bits),
le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
}
if (undo_fn)
@@ -2499,6 +2486,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
+ ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
+ start_bit, count);
goto bail;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 26675185b886..2dfaa5cccc6d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -192,6 +192,7 @@ enum {
Opt_resv_level,
Opt_dir_resv_level,
Opt_journal_async_commit,
+ Opt_err_cont,
Opt_err,
};
@@ -224,6 +225,7 @@ static const match_table_t tokens = {
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
+ {Opt_err_cont, "errors=continue"},
{Opt_err, NULL}
};
@@ -1112,7 +1114,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
- if (!osb->osb_debug_root) {
+ if (IS_ERR_OR_NULL(osb->osb_debug_root)) {
status = -EINVAL;
mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
goto read_super_error;
@@ -1122,7 +1124,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->osb_debug_root,
osb,
&ocfs2_osb_debug_fops);
- if (!osb->osb_ctxt) {
+ if (IS_ERR_OR_NULL(osb->osb_ctxt)) {
status = -EINVAL;
mlog_errno(status);
goto read_super_error;
@@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
break;
case Opt_err_panic:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_err_ro:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
+ break;
+ case Opt_err_cont:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
break;
case Opt_data_ordered:
mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_ERRORS_PANIC)
seq_printf(s, ",errors=panic");
+ else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+ seq_printf(s, ",errors=continue");
else
seq_printf(s, ",errors=remount-ro");
@@ -1606,8 +1619,9 @@ static int __init ocfs2_init(void)
}
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
- if (!ocfs2_debugfs_root) {
- status = -ENOMEM;
+ if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) {
+ status = ocfs2_debugfs_root ?
+ PTR_ERR(ocfs2_debugfs_root) : -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
goto out4;
}
@@ -2069,6 +2083,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+ memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
+ sizeof(di->id2.i_super.s_uuid));
osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
@@ -2333,7 +2349,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog_errno(status);
goto bail;
}
- cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
+ cleancache_init_shared_fs(sb);
bail:
return status;
@@ -2539,33 +2555,47 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
memset(osb, 0, sizeof(struct ocfs2_super));
}
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * Put OCFS2 into a readonly state (default)
+ * Return EIO so that only the process errs
+ * Fix the error as if fsck.ocfs2 -y
+ * panic
+ */
+static int ocfs2_handle_error(struct super_block *sb)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
+ int rv = 0;
+
+ ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("On-disk corruption discovered. "
+ "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
panic("OCFS2: (device %s): panic forced after error\n",
sb->s_id);
+ else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+ pr_crit("OCFS2: Returning error to the calling process.\n");
+ rv = -EIO;
+ }
+ else { /* default option */
+ rv = -EROFS;
+ if (sb->s_flags & MS_RDONLY &&
+ (ocfs2_is_soft_readonly(osb) ||
+ ocfs2_is_hard_readonly(osb)))
+ return rv;
- ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("OCFS2: File system is now read-only.\n");
+ sb->s_flags |= MS_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+ }
- if (sb->s_flags & MS_RDONLY &&
- (ocfs2_is_soft_readonly(osb) ||
- ocfs2_is_hard_readonly(osb)))
- return;
+ return rv;
- printk(KERN_CRIT "File system is now read-only due to the potential "
- "of on-disk corruption. Please run fsck.ocfs2 once the file "
- "system is unmounted.\n");
- sb->s_flags |= MS_RDONLY;
- ocfs2_set_ro_flag(osb, 0);
}
static char error_buf[1024];
-void __ocfs2_error(struct super_block *sb,
+int __ocfs2_error(struct super_block *sb,
const char *function,
const char *fmt, ...)
{
@@ -2580,7 +2610,7 @@ void __ocfs2_error(struct super_block *sb,
printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
sb->s_id, function, error_buf);
- ocfs2_handle_error(sb);
+ return ocfs2_handle_error(sb);
}
/* Handle critical errors. This is intentionally more drastic than
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe..c1c87d90542c 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,7 +32,7 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
__printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...);
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 85b190dc132f..aabccb1281df 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,27 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ocfs2_error(sb,
+ return ocfs2_error(sb,
"Extended attribute block #%llu has bad "
"signature %.*s",
(unsigned long long)bh->b_blocknr, 7,
xb->xb_signature);
- return -EINVAL;
}
if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
+ return ocfs2_error(sb,
"Extended attribute block #%llu has an "
"invalid xb_blkno of %llu",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(xb->xb_blkno));
- return -EINVAL;
}
if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
+ return ocfs2_error(sb,
"Extended attribute block #%llu has an invalid "
"xb_fs_generation of #%u",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(xb->xb_fs_generation));
- return -EINVAL;
}
return 0;
@@ -1238,6 +1235,10 @@ static int ocfs2_xattr_block_get(struct inode *inode,
i,
&block_off,
&name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
xs->base = bucket_block(xs->bucket, block_off);
}
if (ocfs2_xattr_is_local(xs->here)) {
@@ -3690,11 +3691,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
+ ret = ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
"xattr tree block %llu\n", inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
goto out;
}
}
@@ -3709,11 +3709,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
}
if (!e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
"record (%u, %u, 0) in xattr", inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
goto out;
}
@@ -5665,6 +5664,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
i, &xv, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
args->ref_ci,
@@ -7326,6 +7329,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list + prefix_len, name, name_len);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 3a48bb789c9f..a327300b7f40 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode,
struct posix_acl **default_acl, struct posix_acl **acl)
{
struct posix_acl *p;
+ struct posix_acl *clone;
int ret;
+ *acl = NULL;
+ *default_acl = NULL;
+
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
- goto no_acl;
+ return 0;
p = get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(p)) {
- if (p == ERR_PTR(-EOPNOTSUPP))
- goto apply_umask;
- return PTR_ERR(p);
+ if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
+ *mode &= ~current_umask();
+ return 0;
}
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- if (!p)
- goto apply_umask;
-
- *acl = posix_acl_clone(p, GFP_NOFS);
- if (!*acl)
+ clone = posix_acl_clone(p, GFP_NOFS);
+ if (!clone)
goto no_mem;
- ret = posix_acl_create_masq(*acl, mode);
+ ret = posix_acl_create_masq(clone, mode);
if (ret < 0)
goto no_mem_clone;
- if (ret == 0) {
- posix_acl_release(*acl);
- *acl = NULL;
- }
+ if (ret == 0)
+ posix_acl_release(clone);
+ else
+ *acl = clone;
- if (!S_ISDIR(*mode)) {
+ if (!S_ISDIR(*mode))
posix_acl_release(p);
- *default_acl = NULL;
- } else {
+ else
*default_acl = p;
- }
- return 0;
-apply_umask:
- *mode &= ~current_umask();
-no_acl:
- *default_acl = NULL;
- *acl = NULL;
return 0;
no_mem_clone:
- posix_acl_release(*acl);
+ posix_acl_release(clone);
no_mem:
posix_acl_release(p);
return -ENOMEM;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 1295a00ca316..a3893b7505b2 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -99,8 +99,8 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
buf = m->buf + m->count;
/* Ignore error for now */
- string_escape_str(tcomm, &buf, m->size - m->count,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ buf += string_escape_str(tcomm, buf, m->size - m->count,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
m->count = buf - m->buf;
seq_putc(m, '\n');
@@ -188,6 +188,22 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
put_cred(cred);
+ seq_puts(m, "\nNStgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_tgid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_pid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_pgrp_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSsid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_session_nr_ns(p, pid->numbers[g].ns));
seq_putc(m, '\n');
}
@@ -614,7 +630,9 @@ static int children_seq_show(struct seq_file *seq, void *v)
pid_t pid;
pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
- return seq_printf(seq, "%d ", pid);
+ seq_printf(seq, "%d ", pid);
+
+ return 0;
}
static void *children_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3f3d7aeb0712..7a3b82f986dd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -238,13 +238,15 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (lookup_symbol_name(wchan, symname) < 0)
+ if (lookup_symbol_name(wchan, symname) < 0) {
if (!ptrace_may_access(task, PTRACE_MODE_READ))
return 0;
- else
- return seq_printf(m, "%lu", wchan);
- else
- return seq_printf(m, "%s", symname);
+ seq_printf(m, "%lu", wchan);
+ } else {
+ seq_printf(m, "%s", symname);
+ }
+
+ return 0;
}
#endif /* CONFIG_KALLSYMS */
@@ -309,10 +311,12 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- return seq_printf(m, "%llu %llu %lu\n",
- (unsigned long long)task->se.sum_exec_runtime,
- (unsigned long long)task->sched_info.run_delay,
- task->sched_info.pcount);
+ seq_printf(m, "%llu %llu %lu\n",
+ (unsigned long long)task->se.sum_exec_runtime,
+ (unsigned long long)task->sched_info.run_delay,
+ task->sched_info.pcount);
+
+ return 0;
}
#endif
@@ -387,7 +391,9 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
points = oom_badness(task, NULL, NULL, totalpages) *
1000 / totalpages;
read_unlock(&tasklist_lock);
- return seq_printf(m, "%lu\n", points);
+ seq_printf(m, "%lu\n", points);
+
+ return 0;
}
struct limit_names {
@@ -432,15 +438,15 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
* print the file header
*/
seq_printf(m, "%-25s %-20s %-20s %-10s\n",
- "Limit", "Soft Limit", "Hard Limit", "Units");
+ "Limit", "Soft Limit", "Hard Limit", "Units");
for (i = 0; i < RLIM_NLIMITS; i++) {
if (rlim[i].rlim_cur == RLIM_INFINITY)
seq_printf(m, "%-25s %-20s ",
- lnames[i].name, "unlimited");
+ lnames[i].name, "unlimited");
else
seq_printf(m, "%-25s %-20lu ",
- lnames[i].name, rlim[i].rlim_cur);
+ lnames[i].name, rlim[i].rlim_cur);
if (rlim[i].rlim_max == RLIM_INFINITY)
seq_printf(m, "%-20s ", "unlimited");
@@ -462,7 +468,9 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
{
long nr;
unsigned long args[6], sp, pc;
- int res = lock_trace(task);
+ int res;
+
+ res = lock_trace(task);
if (res)
return res;
@@ -477,7 +485,8 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
args[0], args[1], args[2], args[3], args[4], args[5],
sp, pc);
unlock_trace(task);
- return res;
+
+ return 0;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
@@ -2002,12 +2011,13 @@ static int show_timer(struct seq_file *m, void *v)
notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo,
- timer->sigq->info.si_value.sival_ptr);
+ seq_printf(m, "signal: %d/%p\n",
+ timer->sigq->info.si_signo,
+ timer->sigq->info.si_value.sival_ptr);
seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
- (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
- pid_nr_ns(timer->it_pid, tp->ns));
+ nstr[notify & ~SIGEV_THREAD_ID],
+ (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+ pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
return 0;
@@ -2352,21 +2362,23 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
unlock_task_sighand(task, &flags);
}
- result = seq_printf(m,
- "rchar: %llu\n"
- "wchar: %llu\n"
- "syscr: %llu\n"
- "syscw: %llu\n"
- "read_bytes: %llu\n"
- "write_bytes: %llu\n"
- "cancelled_write_bytes: %llu\n",
- (unsigned long long)acct.rchar,
- (unsigned long long)acct.wchar,
- (unsigned long long)acct.syscr,
- (unsigned long long)acct.syscw,
- (unsigned long long)acct.read_bytes,
- (unsigned long long)acct.write_bytes,
- (unsigned long long)acct.cancelled_write_bytes);
+ seq_printf(m,
+ "rchar: %llu\n"
+ "wchar: %llu\n"
+ "syscr: %llu\n"
+ "syscw: %llu\n"
+ "read_bytes: %llu\n"
+ "write_bytes: %llu\n"
+ "cancelled_write_bytes: %llu\n",
+ (unsigned long long)acct.rchar,
+ (unsigned long long)acct.wchar,
+ (unsigned long long)acct.syscr,
+ (unsigned long long)acct.syscw,
+ (unsigned long long)acct.read_bytes,
+ (unsigned long long)acct.write_bytes,
+ (unsigned long long)acct.cancelled_write_bytes);
+ result = 0;
+
out_unlock:
mutex_unlock(&task->signal->cred_guard_mutex);
return result;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 8e5ad83b629a..af84ad04df77 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -8,6 +8,7 @@
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
+#include <linux/fs.h>
#include <linux/proc_fs.h>
@@ -48,17 +49,23 @@ static int seq_show(struct seq_file *m, void *v)
put_files_struct(files);
}
- if (!ret) {
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
- (long long)file->f_pos, f_flags,
- real_mount(file->f_path.mnt)->mnt_id);
- if (file->f_op->show_fdinfo)
- file->f_op->show_fdinfo(m, file);
- ret = seq_has_overflowed(m);
- fput(file);
- }
+ if (ret)
+ return ret;
- return ret;
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ (long long)file->f_pos, f_flags,
+ real_mount(file->f_path.mnt)->mnt_id);
+
+ show_fd_locks(m, file, files);
+ if (seq_has_overflowed(m))
+ goto out;
+
+ if (file->f_op->show_fdinfo)
+ file->f_op->show_fdinfo(m, file);
+
+out:
+ fput(file);
+ return 0;
}
static int seq_fdinfo_open(struct inode *inode, struct file *file)
diff --git a/fs/super.c b/fs/super.c
index 2b7dc90ccdbb..928c20f47af9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
- s->cleancache_poolid = -1;
+ s->cleancache_poolid = CLEANCACHE_NO_POOL;
s->s_shrink.seeks = DEFAULT_SEEKS;
s->s_shrink.scan_objects = super_cache_scan;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 4d46085c1b90..39f1d6a2b04d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -6,6 +6,12 @@
#include <linux/mm_types.h>
#include <linux/bug.h>
+#include <linux/errno.h>
+
+#if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \
+ CONFIG_PGTABLE_LEVELS
+#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{PUD,PMD}_FOLDED
+#endif
/*
* On almost all architectures and configurations, 0 can be used as the
@@ -691,6 +697,30 @@ static inline int pmd_protnone(pmd_t pmd)
#endif /* CONFIG_MMU */
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
+int pud_clear_huge(pud_t *pud);
+int pmd_clear_huge(pmd_t *pmd);
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+static inline int pud_clear_huge(pud_t *pud)
+{
+ return 0;
+}
+static inline int pmd_clear_huge(pmd_t *pmd)
+{
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range
diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h
index 9fa1f653ed3b..c9ccafa0d99a 100644
--- a/include/asm-generic/seccomp.h
+++ b/include/asm-generic/seccomp.h
@@ -17,7 +17,9 @@
#define __NR_seccomp_read_32 __NR_read
#define __NR_seccomp_write_32 __NR_write
#define __NR_seccomp_exit_32 __NR_exit
+#ifndef __NR_seccomp_sigreturn_32
#define __NR_seccomp_sigreturn_32 __NR_rt_sigreturn
+#endif
#endif /* CONFIG_COMPAT && ! already defined */
#define __NR_seccomp_read __NR_read
diff --git a/include/linux/a.out.h b/include/linux/a.out.h
index 220f14338895..ee884168989f 100644
--- a/include/linux/a.out.h
+++ b/include/linux/a.out.h
@@ -4,44 +4,6 @@
#include <uapi/linux/a.out.h>
#ifndef __ASSEMBLY__
-#if defined (M_OLDSUN2)
-#else
-#endif
-#if defined (M_68010)
-#else
-#endif
-#if defined (M_68020)
-#else
-#endif
-#if defined (M_SPARC)
-#else
-#endif
-#if !defined (N_MAGIC)
-#endif
-#if !defined (N_BADMAG)
-#endif
-#if !defined (N_TXTOFF)
-#endif
-#if !defined (N_DATOFF)
-#endif
-#if !defined (N_TRELOFF)
-#endif
-#if !defined (N_DRELOFF)
-#endif
-#if !defined (N_SYMOFF)
-#endif
-#if !defined (N_STROFF)
-#endif
-#if !defined (N_TXTADDR)
-#endif
-#if defined(vax) || defined(hp300) || defined(pyr)
-#endif
-#ifdef sony
-#endif /* Sony. */
-#ifdef is68k
-#endif
-#if defined(m68k) && defined(PORTAR)
-#endif
#ifdef linux
#include <asm/page.h>
#if defined(__i386__) || defined(__mc68000__)
@@ -51,34 +13,5 @@
#endif
#endif
#endif
-#ifndef N_DATADDR
-#endif
-#if !defined (N_BSSADDR)
-#endif
-#if !defined (N_NLIST_DECLARED)
-#endif /* no N_NLIST_DECLARED. */
-#if !defined (N_UNDF)
-#endif
-#if !defined (N_ABS)
-#endif
-#if !defined (N_TEXT)
-#endif
-#if !defined (N_DATA)
-#endif
-#if !defined (N_BSS)
-#endif
-#if !defined (N_FN)
-#endif
-#if !defined (N_EXT)
-#endif
-#if !defined (N_TYPE)
-#endif
-#if !defined (N_STAB)
-#endif
-#if !defined (N_RELOCATION_INFO_DECLARED)
-#ifdef NS32K
-#else
-#endif
-#endif /* no N_RELOCATION_INFO_DECLARED. */
#endif /*__ASSEMBLY__ */
#endif /* __A_OUT_GNU_H__ */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index dbfbf4990005..be4fa5ddf36c 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -172,12 +172,8 @@ extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int
extern int bitmap_print_to_pagebuf(bool list, char *buf,
const unsigned long *maskp, int nmaskbits);
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
-#define BITMAP_LAST_WORD_MASK(nbits) \
-( \
- ((nbits) % BITS_PER_LONG) ? \
- (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \
-)
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define small_const_nbits(nbits) \
(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 5d858e02997f..297f5bda4fdf 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word)
/**
* find_last_bit - find the last set bit in a memory region
* @addr: The address to start the search at
- * @size: The maximum size to search
+ * @size: The number of bits to search
*
- * Returns the bit number of the first set bit, or size.
+ * Returns the bit number of the last set bit, or size.
*/
extern unsigned long find_last_bit(const unsigned long *addr,
unsigned long size);
diff --git a/include/linux/capability.h b/include/linux/capability.h
index aa93e5ef594c..af9f0b9e80e6 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -205,6 +205,7 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
cap_intersect(permitted, __cap_nfsd_set));
}
+#ifdef CONFIG_MULTIUSER
extern bool has_capability(struct task_struct *t, int cap);
extern bool has_ns_capability(struct task_struct *t,
struct user_namespace *ns, int cap);
@@ -213,6 +214,34 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
struct user_namespace *ns, int cap);
extern bool capable(int cap);
extern bool ns_capable(struct user_namespace *ns, int cap);
+#else
+static inline bool has_capability(struct task_struct *t, int cap)
+{
+ return true;
+}
+static inline bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ return true;
+}
+static inline bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+ return true;
+}
+static inline bool has_ns_capability_noaudit(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ return true;
+}
+static inline bool capable(int cap)
+{
+ return true;
+}
+static inline bool ns_capable(struct user_namespace *ns, int cap)
+{
+ return true;
+}
+#endif /* CONFIG_MULTIUSER */
extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h
index 4ce9056b31a8..bda5ec0b4b4d 100644
--- a/include/linux/cleancache.h
+++ b/include/linux/cleancache.h
@@ -5,6 +5,10 @@
#include <linux/exportfs.h>
#include <linux/mm.h>
+#define CLEANCACHE_NO_POOL -1
+#define CLEANCACHE_NO_BACKEND -2
+#define CLEANCACHE_NO_BACKEND_SHARED -3
+
#define CLEANCACHE_KEY_MAX 6
/*
@@ -33,10 +37,9 @@ struct cleancache_ops {
void (*invalidate_fs)(int);
};
-extern struct cleancache_ops *
- cleancache_register_ops(struct cleancache_ops *ops);
+extern int cleancache_register_ops(struct cleancache_ops *ops);
extern void __cleancache_init_fs(struct super_block *);
-extern void __cleancache_init_shared_fs(char *, struct super_block *);
+extern void __cleancache_init_shared_fs(struct super_block *);
extern int __cleancache_get_page(struct page *);
extern void __cleancache_put_page(struct page *);
extern void __cleancache_invalidate_page(struct address_space *, struct page *);
@@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb)
__cleancache_init_fs(sb);
}
-static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+static inline void cleancache_init_shared_fs(struct super_block *sb)
{
if (cleancache_enabled)
- __cleancache_init_shared_fs(uuid, sb);
+ __cleancache_init_shared_fs(sb);
}
static inline int cleancache_get_page(struct page *page)
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 9384ba66e975..f7ef093ec49a 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -16,16 +16,16 @@
struct cma;
extern unsigned long totalcma_pages;
-extern phys_addr_t cma_get_base(struct cma *cma);
-extern unsigned long cma_get_size(struct cma *cma);
+extern phys_addr_t cma_get_base(const struct cma *cma);
+extern unsigned long cma_get_size(const struct cma *cma);
extern int __init cma_declare_contiguous(phys_addr_t base,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
bool fixed, struct cma **res_cma);
-extern int cma_init_reserved_mem(phys_addr_t base,
- phys_addr_t size, int order_per_bit,
+extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
+ unsigned int order_per_bit,
struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align);
-extern bool cma_release(struct cma *cma, struct page *pages, int count);
+extern struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align);
+extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
#endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index a014559e4a49..aa8f61cf3a19 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -34,6 +34,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write,
extern int sysctl_extfrag_threshold;
extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos);
+extern int sysctl_compact_unevictable_allowed;
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 2fb2ca2127ed..8b6c083e68a7 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -62,9 +62,27 @@ do { \
groups_free(group_info); \
} while (0)
-extern struct group_info *groups_alloc(int);
extern struct group_info init_groups;
+#ifdef CONFIG_MULTIUSER
+extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);
+
+extern int in_group_p(kgid_t);
+extern int in_egroup_p(kgid_t);
+#else
+static inline void groups_free(struct group_info *group_info)
+{
+}
+
+static inline int in_group_p(kgid_t grp)
+{
+ return 1;
+}
+static inline int in_egroup_p(kgid_t grp)
+{
+ return 1;
+}
+#endif
extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern int groups_search(const struct group_info *, kgid_t);
@@ -74,9 +92,6 @@ extern bool may_setgroups(void);
#define GROUP_AT(gi, i) \
((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
-extern int in_group_p(kgid_t);
-extern int in_egroup_p(kgid_t);
-
/*
* The security context of a task
*
diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h
new file mode 100644
index 000000000000..b5f0bda9472e
--- /dev/null
+++ b/include/linux/elf-randomize.h
@@ -0,0 +1,22 @@
+#ifndef _ELF_RANDOMIZE_H
+#define _ELF_RANDOMIZE_H
+
+struct mm_struct;
+
+#ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE
+static inline unsigned long arch_mmap_rnd(void) { return 0; }
+# if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK)
+# define compat_brk_randomized
+# endif
+# ifndef arch_randomize_brk
+# define arch_randomize_brk(mm) (mm->brk)
+# endif
+#else
+extern unsigned long arch_mmap_rnd(void);
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+# ifdef CONFIG_COMPAT_BRK
+# define compat_brk_randomized
+# endif
+#endif
+
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1eed7d8c1a09..d6be71ed87d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -869,6 +869,7 @@ static inline struct file *get_file(struct file *f)
atomic_long_inc(&f->f_count);
return f;
}
+#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
#define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1)
#define file_count(x) atomic_long_read(&(x)->f_count)
@@ -1040,6 +1041,9 @@ extern void lease_get_mtime(struct inode *, struct timespec *time);
extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
extern int lease_modify(struct file_lock *, int, struct list_head *);
+struct files_struct;
+extern void show_fd_locks(struct seq_file *f,
+ struct file *filp, struct files_struct *files);
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
struct flock __user *user)
@@ -1176,6 +1180,10 @@ static inline int lease_modify(struct file_lock *fl, int arg,
{
return -EINVAL;
}
+
+struct files_struct;
+static inline void show_fd_locks(struct seq_file *f,
+ struct file *filp, struct files_struct *files) {}
#endif /* !CONFIG_FILE_LOCKING */
@@ -2683,7 +2691,6 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes);
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
-extern int vfs_readdir(struct file *, filldir_t, void *);
extern int iterate_dir(struct file *, struct dir_context *);
extern int vfs_stat(const char __user *, struct kstat *);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 51bd1e72a917..97a9373e61e8 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -57,8 +57,10 @@ struct vm_area_struct;
* _might_ fail. This depends upon the particular VM implementation.
*
* __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
- * cannot handle allocation failures. This modifier is deprecated and no new
- * users should be added.
+ * cannot handle allocation failures. New users should be evaluated carefully
+ * (and the flag should be used only when there is no reasonable failure policy)
+ * but it is definitely preferable to use the flag rather than opencode endless
+ * loop around allocator.
*
* __GFP_NORETRY: The VM implementation must not retry indefinitely.
*
@@ -117,16 +119,6 @@ struct vm_area_struct;
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
__GFP_NO_KSWAPD)
-/*
- * GFP_THISNODE does not perform any reclaim, you most likely want to
- * use __GFP_THISNODE to allocate from a given node without fallback!
- */
-#ifdef CONFIG_NUMA
-#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
-#else
-#define GFP_THISNODE ((__force gfp_t)0)
-#endif
-
/* This mask makes up all the page movable related flags */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f10b20f05159..44a840a53974 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr);
extern int zap_huge_pmd(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr);
@@ -56,6 +59,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag,
spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7b5785032049..5713c49a4a5c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,13 @@ struct mmu_gather;
struct hugepage_subpool {
spinlock_t lock;
long count;
- long max_hpages, used_hpages;
+ long max_hpages; /* Maximum huge pages or -1 if no maximum. */
+ long used_hpages; /* Used count against maximum, includes */
+ /* both alloced and reserved pages. */
+ struct hstate *hstate;
+ long min_hpages; /* Minimum huge pages or -1 if no minimum. */
+ long rsv_hpages; /* Pages reserved against global pool to */
+ /* sasitfy minimum size. */
};
struct resv_map {
@@ -38,11 +44,10 @@ extern int hugetlb_max_hstate __read_mostly;
#define for_each_hstate(h) \
for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+ long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);
-int PageHuge(struct page *page);
-
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -109,11 +114,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
#else /* !CONFIG_HUGETLB_PAGE */
-static inline int PageHuge(struct page *page)
-{
- return 0;
-}
-
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}
diff --git a/include/linux/io.h b/include/linux/io.h
index 42b33f03d1df..986f2bffea1e 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -38,6 +38,14 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
}
#endif
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+void __init ioremap_huge_init(void);
+int arch_ioremap_pud_supported(void);
+int arch_ioremap_pmd_supported(void);
+#else
+static inline void ioremap_huge_init(void) { }
+#endif
+
/*
* Managed iomap interface
*/
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 2c5250222278..388e3ae94f7a 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -196,10 +196,8 @@ extern struct resource * __request_region(struct resource *,
/* Compatibility cruft */
#define release_region(start,n) __release_region(&ioport_resource, (start), (n))
-#define check_mem_region(start,n) __check_region(&iomem_resource, (start), (n))
#define release_mem_region(start,n) __release_region(&iomem_resource, (start), (n))
-extern int __check_region(struct resource *, resource_size_t, resource_size_t);
extern void __release_region(struct resource *, resource_size_t,
resource_size_t);
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -207,12 +205,6 @@ extern int release_mem_region_adjustable(struct resource *, resource_size_t,
resource_size_t);
#endif
-static inline int __deprecated check_region(resource_size_t s,
- resource_size_t n)
-{
- return __check_region(&ioport_resource, s, n);
-}
-
/* Wrappers for managed devices */
struct device;
diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index be342b94c640..63ca8dacec59 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -23,14 +23,6 @@
#define ___config_enabled(__ignored, val, ...) val
/*
- * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm',
- * 0 otherwise.
- *
- */
-#define IS_ENABLED(option) \
- (config_enabled(option) || config_enabled(option##_MODULE))
-
-/*
* IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0
* otherwise. For boolean options, this is equivalent to
* IS_ENABLED(CONFIG_FOO).
@@ -43,4 +35,11 @@
*/
#define IS_MODULE(option) config_enabled(option##_MODULE)
+/*
+ * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm',
+ * 0 otherwise.
+ */
+#define IS_ENABLED(option) \
+ (IS_BUILTIN(option) || IS_MODULE(option))
+
#endif /* __LINUX_KCONFIG_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6d630d31ef3..3a5b48e52a9e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -103,6 +103,18 @@
(((__x) - ((__d) / 2)) / (__d)); \
} \
)
+/*
+ * Same as above but for u64 dividends. divisor must be a 32-bit
+ * number.
+ */
+#define DIV_ROUND_CLOSEST_ULL(x, divisor)( \
+{ \
+ typeof(divisor) __d = divisor; \
+ unsigned long long _tmp = (x) + (__d) / 2; \
+ do_div(_tmp, __d); \
+ _tmp; \
+} \
+)
/*
* Multiplies an integer by a fraction, while avoiding unnecessary
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e60a745ac198..87e37264d464 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -265,6 +265,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 3be6bb18562d..7ae216a39c9e 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -35,18 +35,6 @@ static inline void ksm_exit(struct mm_struct *mm)
__ksm_exit(mm);
}
-/*
- * A KSM page is one of those write-protected "shared pages" or "merged pages"
- * which KSM maps into multiple mms, wherever identical anonymous page content
- * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
- * anon_vma, but to that page's node of the stable tree.
- */
-static inline int PageKsm(struct page *page)
-{
- return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
- (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
-}
-
static inline struct stable_node *page_stable_node(struct page *page)
{
return PageKsm(page) ? page_rmapping(page) : NULL;
@@ -87,11 +75,6 @@ static inline void ksm_exit(struct mm_struct *mm)
{
}
-static inline int PageKsm(struct page *page)
-{
- return 0;
-}
-
#ifdef CONFIG_MMU
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index e8cc45307f8f..9497ec7c77ea 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -365,6 +365,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
#define __initdata_memblock
#endif
+#ifdef CONFIG_MEMTEST
+extern void early_memtest(phys_addr_t start, phys_addr_t end);
+#else
+static inline void early_memtest(phys_addr_t start, phys_addr_t end)
+{
+}
+#endif
+
#else
static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
{
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a41951df9..6ffa0ac7f7d6 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -192,6 +192,9 @@ extern void get_page_bootmem(unsigned long ingo, struct page *page,
void get_online_mems(void);
void put_online_mems(void);
+void mem_hotplug_begin(void);
+void mem_hotplug_done(void);
+
#else /* ! CONFIG_MEMORY_HOTPLUG */
/*
* Stub functions for when hotplug is off
@@ -231,6 +234,9 @@ static inline int try_online_node(int nid)
static inline void get_online_mems(void) {}
static inline void put_online_mems(void) {}
+static inline void mem_hotplug_begin(void) {}
+static inline void mem_hotplug_done(void) {}
+
#endif /* ! CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 39ed62ab5b8a..b19b3023c880 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -29,7 +29,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int nid);
-extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask);
+extern int mempool_resize(mempool_t *pool, int new_min_nr);
extern void mempool_destroy(mempool_t *pool);
extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
extern void mempool_free(void *element, mempool_t *pool);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 78baed5f2952..cac1c0904d5f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -69,7 +69,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
extern bool pmd_trans_migrating(pmd_t pmd);
extern int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node);
-extern bool migrate_ratelimited(int node);
#else
static inline bool pmd_trans_migrating(pmd_t pmd)
{
@@ -80,10 +79,6 @@ static inline int migrate_misplaced_page(struct page *page,
{
return -EAGAIN; /* can't migrate now */
}
-static inline bool migrate_ratelimited(int node)
-{
- return false;
-}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 47a93928b90f..4a3a38522ab4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -433,46 +433,6 @@ static inline void compound_unlock_irqrestore(struct page *page,
#endif
}
-static inline struct page *compound_head_by_tail(struct page *tail)
-{
- struct page *head = tail->first_page;
-
- /*
- * page->first_page may be a dangling pointer to an old
- * compound page, so recheck that it is still a tail
- * page before returning.
- */
- smp_rmb();
- if (likely(PageTail(tail)))
- return head;
- return tail;
-}
-
-/*
- * Since either compound page could be dismantled asynchronously in THP
- * or we access asynchronously arbitrary positioned struct page, there
- * would be tail flag race. To handle this race, we should call
- * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
- */
-static inline struct page *compound_head(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return compound_head_by_tail(page);
- return page;
-}
-
-/*
- * If we access compound page synchronously such as access to
- * allocated page, there is no need to handle tail flag race, so we can
- * check tail flag directly without any synchronization primitive.
- */
-static inline struct page *compound_head_fast(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return page->first_page;
- return page;
-}
-
/*
* The atomic page->_mapcount, starts from -1: so that transitions
* both from it and to it can be tracked, using atomic_inc_and_test
@@ -494,15 +454,6 @@ static inline int page_count(struct page *page)
return atomic_read(&compound_head(page)->_count);
}
-#ifdef CONFIG_HUGETLB_PAGE
-extern int PageHeadHuge(struct page *page_head);
-#else /* CONFIG_HUGETLB_PAGE */
-static inline int PageHeadHuge(struct page *page_head)
-{
- return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
static inline bool __compound_tail_refcounted(struct page *page)
{
return !PageSlab(page) && !PageHeadHuge(page);
@@ -571,53 +522,6 @@ static inline void init_page_count(struct page *page)
atomic_set(&page->_count, 1);
}
-/*
- * PageBuddy() indicate that the page is free and in the buddy system
- * (see mm/page_alloc.c).
- *
- * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
- * -2 so that an underflow of the page_mapcount() won't be mistaken
- * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
- * efficiently by most CPU architectures.
- */
-#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
-
-static inline int PageBuddy(struct page *page)
-{
- return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBuddy(struct page *page)
-{
- VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
- atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBuddy(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageBuddy(page), page);
- atomic_set(&page->_mapcount, -1);
-}
-
-#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
-
-static inline int PageBalloon(struct page *page)
-{
- return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBalloon(struct page *page)
-{
- VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
- atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBalloon(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageBalloon(page), page);
- atomic_set(&page->_mapcount, -1);
-}
-
void put_page(struct page *page);
void put_pages_list(struct list_head *pages);
@@ -1006,31 +910,12 @@ void page_address_init(void);
#define page_address_init() do { } while(0)
#endif
-/*
- * On an anonymous page mapped into a user virtual memory area,
- * page->mapping points to its anon_vma, not to a struct address_space;
- * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
- *
- * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
- * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
- * and then page->mapping points, not to an anon_vma, but to a private
- * structure which KSM associates with that merged page. See ksm.h.
- *
- * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
- *
- * Please note that, confusingly, "page_mapping" refers to the inode
- * address_space which maps the page from disk; whereas "page_mapped"
- * refers to user virtual address space into which the page is mapped.
- */
-#define PAGE_MAPPING_ANON 1
-#define PAGE_MAPPING_KSM 2
-#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
-
extern struct address_space *page_mapping(struct page *page);
/* Neutral page->mapping pointer to address_space or anon_vma or other */
static inline void *page_rmapping(struct page *page)
{
+ page = compound_head(page);
return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
}
@@ -1045,11 +930,6 @@ struct address_space *page_file_mapping(struct page *page)
return page->mapping;
}
-static inline int PageAnon(struct page *page)
-{
- return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
-}
-
/*
* Return the pagecache index of the passed page. Regular pagecache pages
* use ->index whereas swapcache pages use ->private
@@ -1294,9 +1174,11 @@ int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
struct page *page);
void account_page_dirtied(struct page *page, struct address_space *mapping);
+void account_page_cleaned(struct page *page, struct address_space *mapping);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
int clear_page_dirty_for_io(struct page *page);
+
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
/* Is the vma a continuation of the stack vma above it? */
@@ -2109,7 +1991,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
* and return without waiting upon it */
-#define FOLL_MLOCK 0x40 /* mark page as mlocked */
+#define FOLL_POPULATE 0x40 /* fault in page */
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 199a03aab8dc..8d37e26a1007 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -364,7 +364,9 @@ struct mm_struct {
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
atomic_long_t nr_ptes; /* PTE page table pages */
+#if CONFIG_PGTABLE_LEVELS > 2
atomic_long_t nr_pmds; /* PMD page table pages */
+#endif
int map_count; /* number of VMAs */
spinlock_t page_table_lock; /* Protects page tables and some counters */
@@ -427,7 +429,7 @@ struct mm_struct {
#endif
/* store ref to file /proc/<pid>/exe symlink points to */
- struct file *exe_file;
+ struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f279d9c158cd..218f89208e83 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -843,16 +843,16 @@ static inline int populated_zone(struct zone *zone)
extern int movable_zone;
+#ifdef CONFIG_HIGHMEM
static inline int zone_movable_is_highmem(void)
{
-#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
return movable_zone == ZONE_HIGHMEM;
-#elif defined(CONFIG_HIGHMEM)
- return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
#else
- return 0;
+ return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
#endif
}
+#endif
static inline int is_highmem_idx(enum zone_type idx)
{
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 9b2022ab4d85..3d46fb4708e0 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -25,16 +25,11 @@ static inline void touch_nmi_watchdog(void)
#endif
#if defined(CONFIG_HARDLOCKUP_DETECTOR)
-extern void watchdog_enable_hardlockup_detector(bool val);
-extern bool watchdog_hardlockup_detector_is_enabled(void);
+extern void hardlockup_detector_disable(void);
#else
-static inline void watchdog_enable_hardlockup_detector(bool val)
+static inline void hardlockup_detector_disable(void)
{
}
-static inline bool watchdog_hardlockup_detector_is_enabled(void)
-{
- return true;
-}
#endif
/*
@@ -68,12 +63,20 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
#ifdef CONFIG_LOCKUP_DETECTOR
int hw_nmi_is_cpu_stuck(struct pt_regs *);
u64 hw_nmi_get_sample_period(int watchdog_thresh);
+extern int nmi_watchdog_enabled;
+extern int soft_watchdog_enabled;
extern int watchdog_user_enabled;
extern int watchdog_thresh;
extern int sysctl_softlockup_all_cpu_backtrace;
struct ctl_table;
-extern int proc_dowatchdog(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
+extern int proc_watchdog(struct ctl_table *, int ,
+ void __user *, size_t *, loff_t *);
+extern int proc_nmi_watchdog(struct ctl_table *, int ,
+ void __user *, size_t *, loff_t *);
+extern int proc_soft_watchdog(struct ctl_table *, int ,
+ void __user *, size_t *, loff_t *);
+extern int proc_watchdog_thresh(struct ctl_table *, int ,
+ void __user *, size_t *, loff_t *);
#endif
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/include/linux/oom.h b/include/linux/oom.h
index d5771bed59c9..44b2f6f7bbd8 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -66,7 +66,8 @@ extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask);
+ int order, const nodemask_t *nodemask,
+ struct mem_cgroup *memcg);
extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
unsigned long totalpages, const nodemask_t *nodemask,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5ed7bdaf22d5..e5770201bba7 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -134,49 +134,68 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
+/* Page flags policies wrt compound pages */
+#define PF_ANY(page, enforce) page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageTail(page), page); \
+ else \
+ page = compound_head(page); \
+ page;})
+#define PF_NO_COMPOUND(page, enforce) ({ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageCompound(page), page); \
+ page;})
+
/*
* Macros to create function definitions for page flags
*/
-#define TESTPAGEFLAG(uname, lname) \
-static inline int Page##uname(const struct page *page) \
- { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy) \
+static inline int Page##uname(struct page *page) \
+ { return test_bit(PG_##lname, &policy(page, 0)->flags); }
-#define SETPAGEFLAG(uname, lname) \
+#define SETPAGEFLAG(uname, lname, policy) \
static inline void SetPage##uname(struct page *page) \
- { set_bit(PG_##lname, &page->flags); }
+ { set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define CLEARPAGEFLAG(uname, lname) \
+#define CLEARPAGEFLAG(uname, lname, policy) \
static inline void ClearPage##uname(struct page *page) \
- { clear_bit(PG_##lname, &page->flags); }
+ { clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __SETPAGEFLAG(uname, lname) \
+#define __SETPAGEFLAG(uname, lname, policy) \
static inline void __SetPage##uname(struct page *page) \
- { __set_bit(PG_##lname, &page->flags); }
+ { __set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __CLEARPAGEFLAG(uname, lname) \
+#define __CLEARPAGEFLAG(uname, lname, policy) \
static inline void __ClearPage##uname(struct page *page) \
- { __clear_bit(PG_##lname, &page->flags); }
+ { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTSETFLAG(uname, lname) \
+#define TESTSETFLAG(uname, lname, policy) \
static inline int TestSetPage##uname(struct page *page) \
- { return test_and_set_bit(PG_##lname, &page->flags); }
+ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTCLEARFLAG(uname, lname) \
+#define TESTCLEARFLAG(uname, lname, policy) \
static inline int TestClearPage##uname(struct page *page) \
- { return test_and_clear_bit(PG_##lname, &page->flags); }
+ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __TESTCLEARFLAG(uname, lname) \
+#define __TESTCLEARFLAG(uname, lname, policy) \
static inline int __TestClearPage##uname(struct page *page) \
- { return __test_and_clear_bit(PG_##lname, &page->flags); }
+ { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ SETPAGEFLAG(uname, lname, policy) \
+ CLEARPAGEFLAG(uname, lname, policy)
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ __SETPAGEFLAG(uname, lname, policy) \
+ __CLEARPAGEFLAG(uname, lname, policy)
-#define TESTSCFLAG(uname, lname) \
- TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy) \
+ TESTSETFLAG(uname, lname, policy) \
+ TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
@@ -205,47 +224,100 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
-struct page; /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
- __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
- TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked) /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
-PAGEFLAG(SavePinned, savepinned); /* Xen */
-PAGEFLAG(Foreign, foreign); /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
- __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+/* Forward declarations */
+struct page;
+static inline int PageCompound(struct page *page);
+static inline int PageTail(struct page *page);
+
+static inline struct page *compound_head_by_tail(struct page *tail)
+{
+ struct page *head = tail->first_page;
+
+ /*
+ * page->first_page may be a dangling pointer to an old
+ * compound page, so recheck that it is still a tail
+ * page before returning.
+ */
+ smp_rmb();
+ if (likely(PageTail(tail)))
+ return head;
+ return tail;
+}
+
+/*
+ * Since either compound page could be dismantled asynchronously in THP
+ * or we access asynchronously arbitrary positioned struct page, there
+ * would be tail flag race. To handle this race, we should call
+ * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
+ */
+static inline struct page *compound_head(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ return compound_head_by_tail(page);
+ return page;
+}
+
+/*
+ * If we access compound page synchronously such as access to
+ * allocated page, there is no need to handle tail flag race, so we can
+ * check tail flag directly without any synchronization primitive.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ return page->first_page;
+ return page;
+}
+
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+ TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+ __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
- __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+ __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+ TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+ TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
@@ -258,43 +330,89 @@ PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
- TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
- TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif
+/*
+ * On an anonymous page mapped into a user virtual memory area,
+ * page->mapping points to its anon_vma, not to a struct address_space;
+ * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
+ *
+ * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
+ * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
+ * and then page->mapping points, not to an anon_vma, but to a private
+ * structure which KSM associates with that merged page. See ksm.h.
+ *
+ * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
+ *
+ * Please note that, confusingly, "page_mapping" refers to the inode
+ * address_space which maps the page from disk; whereas "page_mapped"
+ * refers to user virtual address space into which the page is mapped.
+ */
+#define PAGE_MAPPING_ANON 1
+#define PAGE_MAPPING_KSM 2
+#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
+
+static inline int PageAnon(struct page *page)
+{
+ page = compound_head(page);
+ return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
+#ifdef CONFIG_KSM
+/*
+ * A KSM page is one of those write-protected "shared pages" or "merged pages"
+ * which KSM maps into multiple mms, wherever identical anonymous page content
+ * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
+ * anon_vma, but to that page's node of the stable tree.
+ */
+static inline int PageKsm(struct page *page)
+{
+ page = compound_head(page);
+ return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+}
+#else
+TESTPAGEFLAG_FALSE(Ksm)
+#endif
+
u64 stable_page_flags(struct page *page);
static inline int PageUptodate(struct page *page)
{
- int ret = test_bit(PG_uptodate, &(page)->flags);
-
+ int ret;
+ page = compound_head(page);
+ ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
@@ -311,24 +429,24 @@ static inline int PageUptodate(struct page *page)
static inline void __SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
- __set_bit(PG_uptodate, &(page)->flags);
+ __set_bit(PG_uptodate, &page->flags);
}
static inline void SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
- set_bit(PG_uptodate, &(page)->flags);
+ set_bit(PG_uptodate, &page->flags);
}
-CLEARPAGEFLAG(Uptodate, uptodate)
-
-extern void cancel_dirty_page(struct page *page, unsigned int account_size);
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -357,8 +475,8 @@ static inline void set_page_writeback_keepwrite(struct page *page)
* and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages
* and avoid handling those in real mode.
*/
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-__PAGEFLAG(Tail, tail)
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
+__PAGEFLAG(Tail, tail, PF_ANY)
static inline int PageCompound(struct page *page)
{
@@ -382,8 +500,8 @@ static inline void ClearPageCompound(struct page *page)
* because PageCompound is always set for compound pages and not for
* pages on the LRU and/or pagecache.
*/
-TESTPAGEFLAG(Compound, compound)
-__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound)
+TESTPAGEFLAG(Compound, compound, PF_ANY)
+__SETPAGEFLAG(Head, compound, PF_ANY) __CLEARPAGEFLAG(Head, compound, PF_ANY)
/*
* PG_reclaim is used in combination with PG_compound to mark the
@@ -428,6 +546,14 @@ static inline void ClearPageCompound(struct page *page)
#endif /* !PAGEFLAGS_EXTENDED */
+#ifdef CONFIG_HUGETLB_PAGE
+int PageHuge(struct page *page);
+int PageHeadHuge(struct page *page);
+#else
+TESTPAGEFLAG_FALSE(Huge)
+TESTPAGEFLAG_FALSE(HeadHuge)
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* PageHuge() only returns true for hugetlbfs pages, but not for
@@ -464,22 +590,57 @@ static inline int PageTransTail(struct page *page)
}
#else
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
+#endif
-static inline int PageTransHuge(struct page *page)
+/*
+ * PageBuddy() indicate that the page is free and in the buddy system
+ * (see mm/page_alloc.c).
+ *
+ * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
+ * -2 so that an underflow of the page_mapcount() won't be mistaken
+ * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
+ * efficiently by most CPU architectures.
+ */
+#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
+
+static inline int PageBuddy(struct page *page)
{
- return 0;
+ return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
}
-static inline int PageTransCompound(struct page *page)
+static inline void __SetPageBuddy(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+ atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
}
-static inline int PageTransTail(struct page *page)
+static inline void __ClearPageBuddy(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageBuddy(page), page);
+ atomic_set(&page->_mapcount, -1);
+}
+
+#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
+
+static inline int PageBalloon(struct page *page)
+{
+ return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageBalloon(struct page *page)
+{
+ VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+ atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageBalloon(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageBalloon(page), page);
+ atomic_set(&page->_mapcount, -1);
}
-#endif
/*
* If network-based swap is enabled, sl*b must keep track of whether pages
@@ -554,6 +715,10 @@ static inline int page_has_private(struct page *page)
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3736f7065c..7c3790764795 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -426,18 +426,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
extern void unlock_page(struct page *page);
-static inline void __set_page_locked(struct page *page)
-{
- __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
- __clear_bit(PG_locked, &page->flags);
-}
-
static inline int trylock_page(struct page *page)
{
+ page = compound_head(page);
return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
@@ -490,9 +481,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (PageLocked(page))
- return wait_on_page_bit_killable(page, PG_locked);
- return 0;
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -511,7 +502,7 @@ static inline void wake_up_page(struct page *page, int bit)
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ wait_on_page_bit(compound_head(page), PG_locked);
}
/*
@@ -656,17 +647,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
- __set_page_locked(page);
+ __SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return error;
}
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 2110a81c5e2a..7b2a7fcde6a3 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -32,6 +32,10 @@
/********** mm/debug-pagealloc.c **********/
#define PAGE_POISON 0xaa
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA)
+
/********** mm/slab.c **********/
/*
* Magic nums for obj red zoning.
diff --git a/include/linux/printk.h b/include/linux/printk.h
index baa3f97d8ce8..9b30871c9149 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -255,6 +255,11 @@ extern asmlinkage void dump_stack(void) __cold;
printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...) \
printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+/*
+ * Like KERN_CONT, pr_cont() should only be used when continuing
+ * a line with no newline ('\n') enclosed. Otherwise it defaults
+ * back to KERN_DEFAULT.
+ */
#define pr_cont(fmt, ...) \
printk(KERN_CONT fmt, ##__VA_ARGS__)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c4c559a45dc8..9c5ff69fa0cd 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -85,6 +85,7 @@ enum ttu_flags {
TTU_UNMAP = 1, /* unmap mode */
TTU_MIGRATION = 2, /* migration mode */
TTU_MUNLOCK = 4, /* munlock mode */
+ TTU_FREE = 8, /* free mode */
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
@@ -191,7 +192,8 @@ static inline void page_dup_rmap(struct page *page)
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags);
+ struct mem_cgroup *memcg, unsigned long *vm_flags,
+ int *is_pte_dirty);
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
@@ -268,9 +270,12 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
static inline int page_referenced(struct page *page, int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
return 0;
}
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index dcad7ee0d746..38e0b4d1591a 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -133,6 +133,10 @@ struct rtc_device
/* Some hardware can't support UIE mode */
int uie_unsupported;
+#ifdef CONFIG_PM_SLEEP
+ struct rtc_wkalrm alarm;
+ bool valid_alarm;
+#endif
#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
struct work_struct uie_task;
struct timer_list uie_timer;
diff --git a/include/linux/string.h b/include/linux/string.h
index e40099e585c9..12a5a60f1f3b 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -117,6 +117,7 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 657571817260..0991913f4953 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -47,22 +47,22 @@ static inline int string_unescape_any_inplace(char *buf)
#define ESCAPE_ANY_NP (ESCAPE_ANY | ESCAPE_NP)
#define ESCAPE_HEX 0x20
-int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz,
+int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
unsigned int flags, const char *esc);
static inline int string_escape_mem_any_np(const char *src, size_t isz,
- char **dst, size_t osz, const char *esc)
+ char *dst, size_t osz, const char *esc)
{
return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc);
}
-static inline int string_escape_str(const char *src, char **dst, size_t sz,
+static inline int string_escape_str(const char *src, char *dst, size_t sz,
unsigned int flags, const char *esc)
{
return string_escape_mem(src, strlen(src), dst, sz, flags, esc);
}
-static inline int string_escape_str_any_np(const char *src, char **dst,
+static inline int string_escape_str_any_np(const char *src, char *dst,
size_t sz, const char *esc)
{
return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7067eca501e2..0428e4c84e1d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,6 +307,7 @@ extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
+extern void deactivate_file_page(struct page *page);
extern void deactivate_page(struct page *page);
extern void swap_setup(void);
diff --git a/include/linux/types.h b/include/linux/types.h
index 6747247e3f9f..59698be03490 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -146,12 +146,6 @@ typedef u64 dma_addr_t;
typedef u32 dma_addr_t;
#endif /* dma_addr_t */
-#ifdef __CHECKER__
-#else
-#endif
-#ifdef __CHECK_ENDIAN__
-#else
-#endif
typedef unsigned __bitwise__ gfp_t;
typedef unsigned __bitwise__ fmode_t;
typedef unsigned __bitwise__ oom_flags_t;
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index 2d1f9b627f91..0ee05da38899 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -29,6 +29,7 @@ typedef struct {
#define KUIDT_INIT(value) (kuid_t){ value }
#define KGIDT_INIT(value) (kgid_t){ value }
+#ifdef CONFIG_MULTIUSER
static inline uid_t __kuid_val(kuid_t uid)
{
return uid.val;
@@ -38,6 +39,17 @@ static inline gid_t __kgid_val(kgid_t gid)
{
return gid.val;
}
+#else
+static inline uid_t __kuid_val(kuid_t uid)
+{
+ return 0;
+}
+
+static inline gid_t __kgid_val(kgid_t gid)
+{
+ return 0;
+}
+#endif
#define GLOBAL_ROOT_UID KUIDT_INIT(0)
#define GLOBAL_ROOT_GID KGIDT_INIT(0)
diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h
new file mode 100644
index 000000000000..d5f4fb69dba3
--- /dev/null
+++ b/include/linux/util_macros.h
@@ -0,0 +1,40 @@
+#ifndef _LINUX_HELPER_MACROS_H_
+#define _LINUX_HELPER_MACROS_H_
+
+#define __find_closest(x, a, as, op) \
+({ \
+ typeof(as) __fc_i, __fc_as = (as) - 1; \
+ typeof(x) __fc_x = (x); \
+ typeof(*a) *__fc_a = (a); \
+ for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \
+ if (__fc_x op DIV_ROUND_CLOSEST(__fc_a[__fc_i] + \
+ __fc_a[__fc_i + 1], 2)) \
+ break; \
+ } \
+ (__fc_i); \
+})
+
+/**
+ * find_closest - locate the closest element in a sorted array
+ * @x: The reference value.
+ * @a: The array in which to look for the closest element. Must be sorted
+ * in ascending order.
+ * @as: Size of 'a'.
+ *
+ * Returns the index of the element closest to 'x'.
+ */
+#define find_closest(x, a, as) __find_closest(x, a, as, <=)
+
+/**
+ * find_closest_descending - locate the closest element in a sorted array
+ * @x: The reference value.
+ * @a: The array in which to look for the closest element. Must be sorted
+ * in descending order.
+ * @as: Size of 'a'.
+ *
+ * Similar to find_closest() but 'a' is expected to be sorted in descending
+ * order.
+ */
+#define find_closest_descending(x, a, as) __find_closest(x, a, as, >=)
+
+#endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9246d32dc973..2b1cef88b827 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT,
+ PGLAZYFREED,
FOR_ALL_ZONES(PGREFILL),
FOR_ALL_ZONES(PGSTEAL_KSWAPD),
FOR_ALL_ZONES(PGSTEAL_DIRECT),
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 3283c6a55425..1338190b5478 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -47,5 +47,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
unsigned long zs_get_total_pages(struct zs_pool *pool);
+unsigned long zs_compact(struct zs_pool *pool);
#endif
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index d06b6da5c1e3..bce990f5a35d 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -224,7 +224,7 @@ TRACE_EVENT(xen_mmu_pmd_clear,
TP_printk("pmdp %p", __entry->pmdp)
);
-#if PAGETABLE_LEVELS >= 4
+#if CONFIG_PGTABLE_LEVELS >= 4
TRACE_EVENT(xen_mmu_set_pud,
TP_PROTO(pud_t *pudp, pud_t pudval),
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ddc3b36f1046..7a94102b7a02 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -34,6 +34,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/init/Kconfig b/init/Kconfig
index 25bf338f97ad..e0e9e75ac112 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -407,6 +407,7 @@ endchoice
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
+ depends on MULTIUSER
help
If you say Y here, a user level program will be able to instruct the
kernel (via a special system call) to write process accounting
@@ -433,6 +434,7 @@ config BSD_PROCESS_ACCT_V3
config TASKSTATS
bool "Export task/process statistics through netlink"
depends on NET
+ depends on MULTIUSER
default n
help
Export selected statistics for tasks/processes through the
@@ -1173,6 +1175,7 @@ config CHECKPOINT_RESTORE
menuconfig NAMESPACES
bool "Namespaces support" if EXPERT
+ depends on MULTIUSER
default !EXPERT
help
Provides the way to make tasks work with different objects using
@@ -1440,11 +1443,25 @@ menuconfig EXPERT
config UID16
bool "Enable 16-bit UID system calls" if EXPERT
- depends on HAVE_UID16
+ depends on HAVE_UID16 && MULTIUSER
default y
help
This enables the legacy 16-bit UID syscall wrappers.
+config MULTIUSER
+ bool "Multiple users, groups and capabilities support" if EXPERT
+ default y
+ help
+ This option enables support for non-root users, groups and
+ capabilities.
+
+ If you say N here, all processes will run with UID 0, GID 0, and all
+ possible capabilities. Saying N here also compiles out support for
+ system calls related to UIDs, GIDs, and capabilities, such as setuid,
+ setgid, and capset.
+
+ If unsure, say Y here.
+
config SGETMASK_SYSCALL
bool "sgetmask/ssetmask syscalls support" if EXPERT
def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH
diff --git a/init/main.c b/init/main.c
index 54565bf57beb..739a677e8fa9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -80,6 +80,7 @@
#include <linux/list.h>
#include <linux/integrity.h>
#include <linux/proc_ns.h>
+#include <linux/io.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -484,6 +485,7 @@ static void __init mm_init(void)
percpu_init_late();
pgtable_init();
vmalloc_init();
+ ioremap_huge_init();
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/ipc/msg.c b/ipc/msg.c
index a7261d5cbc89..2b6fdbb9e0e9 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1015,22 +1015,24 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
struct user_namespace *user_ns = seq_user_ns(s);
struct msg_queue *msq = it;
- return seq_printf(s,
- "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
- msq->q_perm.key,
- msq->q_perm.id,
- msq->q_perm.mode,
- msq->q_cbytes,
- msq->q_qnum,
- msq->q_lspid,
- msq->q_lrpid,
- from_kuid_munged(user_ns, msq->q_perm.uid),
- from_kgid_munged(user_ns, msq->q_perm.gid),
- from_kuid_munged(user_ns, msq->q_perm.cuid),
- from_kgid_munged(user_ns, msq->q_perm.cgid),
- msq->q_stime,
- msq->q_rtime,
- msq->q_ctime);
+ seq_printf(s,
+ "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
+ msq->q_perm.key,
+ msq->q_perm.id,
+ msq->q_perm.mode,
+ msq->q_cbytes,
+ msq->q_qnum,
+ msq->q_lspid,
+ msq->q_lrpid,
+ from_kuid_munged(user_ns, msq->q_perm.uid),
+ from_kgid_munged(user_ns, msq->q_perm.gid),
+ from_kuid_munged(user_ns, msq->q_perm.cuid),
+ from_kgid_munged(user_ns, msq->q_perm.cgid),
+ msq->q_stime,
+ msq->q_rtime,
+ msq->q_ctime);
+
+ return 0;
}
#endif
diff --git a/ipc/sem.c b/ipc/sem.c
index 92842113c6a9..d1a6edd17eba 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2170,17 +2170,19 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
sem_otime = get_semotime(sma);
- return seq_printf(s,
- "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
- sma->sem_perm.key,
- sma->sem_perm.id,
- sma->sem_perm.mode,
- sma->sem_nsems,
- from_kuid_munged(user_ns, sma->sem_perm.uid),
- from_kgid_munged(user_ns, sma->sem_perm.gid),
- from_kuid_munged(user_ns, sma->sem_perm.cuid),
- from_kgid_munged(user_ns, sma->sem_perm.cgid),
- sem_otime,
- sma->sem_ctime);
+ seq_printf(s,
+ "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
+ sma->sem_perm.key,
+ sma->sem_perm.id,
+ sma->sem_perm.mode,
+ sma->sem_nsems,
+ from_kuid_munged(user_ns, sma->sem_perm.uid),
+ from_kgid_munged(user_ns, sma->sem_perm.gid),
+ from_kuid_munged(user_ns, sma->sem_perm.cuid),
+ from_kgid_munged(user_ns, sma->sem_perm.cgid),
+ sem_otime,
+ sma->sem_ctime);
+
+ return 0;
}
#endif
diff --git a/ipc/shm.c b/ipc/shm.c
index 19633b4a2350..d280a74af2ef 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1342,25 +1342,27 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
#define SIZE_SPEC "%21lu"
#endif
- return seq_printf(s,
- "%10d %10d %4o " SIZE_SPEC " %5u %5u "
- "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
- SIZE_SPEC " " SIZE_SPEC "\n",
- shp->shm_perm.key,
- shp->shm_perm.id,
- shp->shm_perm.mode,
- shp->shm_segsz,
- shp->shm_cprid,
- shp->shm_lprid,
- shp->shm_nattch,
- from_kuid_munged(user_ns, shp->shm_perm.uid),
- from_kgid_munged(user_ns, shp->shm_perm.gid),
- from_kuid_munged(user_ns, shp->shm_perm.cuid),
- from_kgid_munged(user_ns, shp->shm_perm.cgid),
- shp->shm_atim,
- shp->shm_dtim,
- shp->shm_ctim,
- rss * PAGE_SIZE,
- swp * PAGE_SIZE);
+ seq_printf(s,
+ "%10d %10d %4o " SIZE_SPEC " %5u %5u "
+ "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
+ SIZE_SPEC " " SIZE_SPEC "\n",
+ shp->shm_perm.key,
+ shp->shm_perm.id,
+ shp->shm_perm.mode,
+ shp->shm_segsz,
+ shp->shm_cprid,
+ shp->shm_lprid,
+ shp->shm_nattch,
+ from_kuid_munged(user_ns, shp->shm_perm.uid),
+ from_kgid_munged(user_ns, shp->shm_perm.gid),
+ from_kuid_munged(user_ns, shp->shm_perm.cuid),
+ from_kgid_munged(user_ns, shp->shm_perm.cgid),
+ shp->shm_atim,
+ shp->shm_dtim,
+ shp->shm_ctim,
+ rss * PAGE_SIZE,
+ swp * PAGE_SIZE);
+
+ return 0;
}
#endif
diff --git a/ipc/util.c b/ipc/util.c
index 106bed0378ab..ff3323ef8d8b 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -837,8 +837,10 @@ static int sysvipc_proc_show(struct seq_file *s, void *it)
struct ipc_proc_iter *iter = s->private;
struct ipc_proc_iface *iface = iter->iface;
- if (it == SEQ_START_TOKEN)
- return seq_puts(s, iface->header);
+ if (it == SEQ_START_TOKEN) {
+ seq_puts(s, iface->header);
+ return 0;
+ }
return iface->show(s, it);
}
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b3353a3c..0f8f8b0bc1bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o groups.o smpboot.o
+ async.o range.o smpboot.o
+
+obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
diff --git a/kernel/capability.c b/kernel/capability.c
index 989f5bfc57dc..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str)
}
__setup("no_file_caps", file_caps_disable);
+#ifdef CONFIG_MULTIUSER
/*
* More recent versions of libcap are available from:
*
@@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap)
}
EXPORT_SYMBOL(ns_capable);
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+#endif /* CONFIG_MULTIUSER */
+
/**
* file_ns_capable - Determine if the file's opener had a capability in effect
* @file: The file we want to check
@@ -412,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
- * capable - Determine if the current task has a superior capability in effect
- * @cap: The capability to be tested for
- *
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
- *
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
- */
-bool capable(int cap)
-{
- return ns_capable(&init_user_ns, cap);
-}
-EXPORT_SYMBOL(capable);
-
-/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a220fdb66568..469dd547770c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4196,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
- return seq_printf(s, "%d\n", *(int *)v);
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
}
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
@@ -5451,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return idr_find(&ss->css_idr, id);
+ return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
#ifdef CONFIG_CGROUP_DEBUG
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index df1f24032fc6..123e75c0889a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2457,20 +2457,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
- * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
- * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
- * flag, yes.
+ * If we're in interrupt, yes, we can always allocate. If @node is set in
+ * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
* Otherwise, no.
*
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
* unless the task has been OOM killed as is marked TIF_MEMDIE.
@@ -2506,7 +2498,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
int allowed; /* is allocation in zone z allowed? */
unsigned long flags;
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+ if (in_interrupt())
return 1;
if (node_isset(node, current->mems_allowed))
return 1;
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -29,6 +29,9 @@
static struct kmem_cache *cred_jar;
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
/*
* The initial credentials for the initial task
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139615a0..20bb2ec39215 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -270,8 +270,8 @@ void __init fork_init(unsigned long mempages)
/*
* The default maximum number of threads is set to a safe
- * value: the thread structures can take up at most half
- * of memory.
+ * value: the thread structures can take up at most one
+ * eighth of the memory.
*/
max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
@@ -380,6 +380,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+ /* No ordering required: file already has been exposed. */
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
mm->total_vm = oldmm->total_vm;
mm->shared_vm = oldmm->shared_vm;
mm->exec_vm = oldmm->exec_vm;
@@ -505,7 +508,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
pgd_free(mm, mm->pgd);
}
#else
-#define dup_mmap(mm, oldmm) (0)
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ down_write(&oldmm->mmap_sem);
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ up_write(&oldmm->mmap_sem);
+ return 0;
+}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
@@ -674,35 +683,46 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+/**
+ * set_mm_exe_file - change a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
+ *
+ * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+ * Callers prevent concurrent invocations: in mmput() nobody alive left,
+ * in execve task is single-threaded, prctl holds mmap_sem exclusively.
+ */
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
+ struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
+ !atomic_read(&mm->mm_users) || current->in_execve ||
+ lockdep_is_held(&mm->mmap_sem));
+
if (new_exe_file)
get_file(new_exe_file);
- if (mm->exe_file)
- fput(mm->exe_file);
- mm->exe_file = new_exe_file;
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ if (old_exe_file)
+ fput(old_exe_file);
}
+/**
+ * get_mm_exe_file - acquire a reference to the mm's executable file
+ *
+ * Returns %NULL if mm has no associated executable file.
+ * User must release file via fput().
+ */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
- /* We need mmap_sem to protect against races with removal of exe_file */
- down_read(&mm->mmap_sem);
- exe_file = mm->exe_file;
- if (exe_file)
- get_file(exe_file);
- up_read(&mm->mmap_sem);
+ rcu_read_lock();
+ exe_file = rcu_dereference(mm->exe_file);
+ if (exe_file && !get_file_rcu(exe_file))
+ exe_file = NULL;
+ rcu_read_unlock();
return exe_file;
}
-static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
- /* It's safe to write the exe_file pointer without exe_file_lock because
- * this is called during fork when the task is not yet in /proc */
- newmm->exe_file = get_mm_exe_file(oldmm);
-}
-
/**
* get_task_mm - acquire a reference to the task's mm
*
@@ -864,8 +884,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
if (!mm_init(mm, tsk))
goto fail_nomem;
- dup_mm_exe_file(oldmm, mm);
-
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
@@ -1406,10 +1424,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
- retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
- if (!pid)
+ if (IS_ERR(pid)) {
+ retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
+ }
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
diff --git a/kernel/groups.c b/kernel/groups.c
index 664411f171b5..74d431d25251 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -9,9 +9,6 @@
#include <linux/user_namespace.h>
#include <asm/uaccess.h>
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *group_info;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06db12434d72..e0f90c2b57aa 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
return;
rcu_read_lock();
- do_each_thread(g, t) {
+ for_each_process_thread(g, t) {
if (!max_count--)
goto unlock;
if (!--batch_count) {
@@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
- } while_each_thread(g, t);
+ }
unlock:
rcu_read_unlock();
}
diff --git a/kernel/pid.c b/kernel/pid.c
index cd36a5e0d173..4fd07d5b7baf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
spin_unlock_irq(&pidmap_lock);
kfree(page);
if (unlikely(!map->page))
- break;
+ return -ENOMEM;
}
if (likely(atomic_read(&map->nr_free))) {
for ( ; ; ) {
@@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
}
pid = mk_pid(pid_ns, map, offset);
}
- return -1;
+ return -EAGAIN;
}
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
@@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns)
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
+ int retval = -ENOMEM;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
- goto out;
+ return ERR_PTR(retval);
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
- if (nr < 0)
+ if (IS_ERR_VALUE(nr)) {
+ retval = nr;
goto out_free;
+ }
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
@@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
spin_unlock_irq(&pidmap_lock);
-out:
return pid;
out_unlock:
@@ -351,8 +353,7 @@ out_free:
free_pidmap(pid->numbers + i);
kmem_cache_free(ns->pid_cachep, pid);
- pid = NULL;
- goto out;
+ return ERR_PTR(retval);
}
void disable_pid_allocation(struct pid_namespace *ns)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 227fec36b12a..c8e0e050a36a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
static int ptrace_detach(struct task_struct *child, unsigned int data)
{
- bool dead = false;
-
if (!valid_signal(data))
return -EIO;
@@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
write_lock_irq(&tasklist_lock);
/*
- * This child can be already killed. Make sure de_thread() or
- * our sub-thread doing do_wait() didn't do release_task() yet.
+ * We rely on ptrace_freeze_traced(). It can't be killed and
+ * untraced by another thread, it can't be a zombie.
*/
- if (child->ptrace) {
- child->exit_code = data;
- dead = __ptrace_detach(current, child);
- }
+ WARN_ON(!child->ptrace || child->exit_state);
+ /*
+ * tasklist_lock avoids the race with wait_task_stopped(), see
+ * the comment in ptrace_resume().
+ */
+ child->exit_code = data;
+ __ptrace_detach(current, child);
write_unlock_irq(&tasklist_lock);
proc_ptrace_connector(child, PTRACE_DETACH);
- if (unlikely(dead))
- release_task(child);
return 0;
}
@@ -697,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child,
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
+ bool need_siglock;
+
if (!valid_signal(data))
return -EIO;
@@ -724,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request,
user_disable_single_step(child);
}
+ /*
+ * Change ->exit_code and ->state under siglock to avoid the race
+ * with wait_task_stopped() in between; a non-zero ->exit_code will
+ * wrongly look like another report from tracee.
+ *
+ * Note that we need siglock even if ->exit_code == data and/or this
+ * status was not reported yet, the new status must not be cleared by
+ * wait_task_stopped() after resume.
+ *
+ * If data == 0 we do not care if wait_task_stopped() reports the old
+ * status and clears the code too; this can't race with the tracee, it
+ * takes siglock after resume.
+ */
+ need_siglock = data && !thread_group_empty(current);
+ if (need_siglock)
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
wake_up_state(child, __TASK_TRACED);
+ if (need_siglock)
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index 19f2357dfda3..90552aab5f2d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res)
*
* request_region creates a new busy region.
*
- * check_region returns non-zero if the area is already busy.
- *
* release_region releases a matching busy region.
*/
@@ -1098,36 +1096,6 @@ struct resource * __request_region(struct resource *parent,
EXPORT_SYMBOL(__request_region);
/**
- * __check_region - check if a resource region is busy or free
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- *
- * Returns 0 if the region is free at the moment it is checked,
- * returns %-EBUSY if the region is busy.
- *
- * NOTE:
- * This function is deprecated because its use is racy.
- * Even if it returns 0, a subsequent call to request_region()
- * may fail because another driver etc. just allocated the region.
- * Do NOT use it. It will be removed from the kernel.
- */
-int __check_region(struct resource *parent, resource_size_t start,
- resource_size_t n)
-{
- struct resource * res;
-
- res = __request_region(parent, start, n, "check-region", 0);
- if (!res)
- return -EBUSY;
-
- release_resource(res);
- free_resource(res);
- return 0;
-}
-EXPORT_SYMBOL(__check_region);
-
-/**
* __release_region - release a previously reserved resource region
* @parent: parent resource descriptor
* @start: resource start address
diff --git a/kernel/signal.c b/kernel/signal.c
index a390499943e4..d51c5ddd855c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2992,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
- (task_pid_vnr(current) != pid)) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info->si_code < 0);
+ (task_pid_vnr(current) != pid))
return -EPERM;
- }
+
info->si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -3041,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
- if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
- (task_pid_vnr(current) != pid)) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info->si_code < 0);
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid))
return -EPERM;
- }
+
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
diff --git a/kernel/sys.c b/kernel/sys.c
index a03d9cd23ed7..3be344902316 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -325,6 +325,7 @@ out_unlock:
* SMP: There are not races, the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
*/
+#ifdef CONFIG_MULTIUSER
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
{
struct user_namespace *ns = current_user_ns();
@@ -815,6 +816,7 @@ change_okay:
commit_creds(new);
return old_fsgid;
}
+#endif /* CONFIG_MULTIUSER */
/**
* sys_getpid - return the thread group id of the current process
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5adcb0ae3a58..7995ef5868d8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -159,6 +159,20 @@ cond_syscall(sys_uselib);
cond_syscall(sys_fadvise64);
cond_syscall(sys_fadvise64_64);
cond_syscall(sys_madvise);
+cond_syscall(sys_setuid);
+cond_syscall(sys_setregid);
+cond_syscall(sys_setgid);
+cond_syscall(sys_setreuid);
+cond_syscall(sys_setresuid);
+cond_syscall(sys_getresuid);
+cond_syscall(sys_setresgid);
+cond_syscall(sys_getresgid);
+cond_syscall(sys_setgroups);
+cond_syscall(sys_getgroups);
+cond_syscall(sys_setfsuid);
+cond_syscall(sys_setfsgid);
+cond_syscall(sys_capget);
+cond_syscall(sys_capset);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 83d907afb4a6..19f3aeecb4f0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -847,7 +847,7 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_user_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog,
+ .proc_handler = proc_watchdog,
.extra1 = &zero,
.extra2 = &one,
},
@@ -856,11 +856,33 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dowatchdog,
+ .proc_handler = proc_watchdog_thresh,
.extra1 = &zero,
.extra2 = &sixty,
},
{
+ .procname = "nmi_watchdog",
+ .data = &nmi_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_nmi_watchdog,
+ .extra1 = &zero,
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+ .extra2 = &one,
+#else
+ .extra2 = &zero,
+#endif
+ },
+ {
+ .procname = "soft_watchdog",
+ .data = &soft_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_soft_watchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "softlockup_panic",
.data = &softlockup_panic,
.maxlen = sizeof(int),
@@ -880,15 +902,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif /* CONFIG_SMP */
- {
- .procname = "nmi_watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dowatchdog,
- .extra1 = &zero,
- .extra2 = &one,
- },
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
@@ -1314,6 +1327,15 @@ static struct ctl_table vm_table[] = {
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
+ {
+ .procname = "compact_unevictable_allowed",
+ .data = &sysctl_compact_unevictable_allowed,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif /* CONFIG_COMPACTION */
{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c3e4fcfddd45..3f34496244e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p)
local_irq_enable();
}
-static int trace_lookup_stack(struct seq_file *m, long i)
+static void trace_lookup_stack(struct seq_file *m, long i)
{
unsigned long addr = stack_dump_trace[i];
- return seq_printf(m, "%pS\n", (void *)addr);
+ seq_printf(m, "%pS\n", (void *)addr);
}
static void print_disabled(struct seq_file *m)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3174bf8e3538..f2be11ab7e08 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,8 +24,33 @@
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
-int watchdog_user_enabled = 1;
+/*
+ * The run state of the lockup detectors is controlled by the content of the
+ * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
+ * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
+ *
+ * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
+ * are variables that are only used as an 'interface' between the parameters
+ * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
+ * 'watchdog_thresh' variable is handled differently because its value is not
+ * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
+ * is equal zero.
+ */
+#define NMI_WATCHDOG_ENABLED_BIT 0
+#define SOFT_WATCHDOG_ENABLED_BIT 1
+#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
+#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#else
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+#endif
+int __read_mostly nmi_watchdog_enabled;
+int __read_mostly soft_watchdog_enabled;
+int __read_mostly watchdog_user_enabled;
int __read_mostly watchdog_thresh = 10;
+
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
@@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn;
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-
-static bool hardlockup_detector_enabled = true;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true;
* kernel command line parameters are parsed, because otherwise it is not
* possible to override this in hardlockup_panic_setup().
*/
-void watchdog_enable_hardlockup_detector(bool val)
-{
- hardlockup_detector_enabled = val;
-}
-
-bool watchdog_hardlockup_detector_is_enabled(void)
+void hardlockup_detector_disable(void)
{
- return hardlockup_detector_enabled;
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
}
static int __init hardlockup_panic_setup(char *str)
@@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- watchdog_user_enabled = 0;
- else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) {
- /*
- * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
- * has the same effect.
- */
- watchdog_user_enabled = 1;
- watchdog_enable_hardlockup_detector(true);
- }
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- watchdog_user_enabled = 0;
+ watchdog_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
-/* deprecated */
static int __init nosoftlockup_setup(char *str)
{
- watchdog_user_enabled = 0;
+ watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
-/* */
+
#ifdef CONFIG_SMP
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
@@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
- /* Warn about unreasonable delays: */
- if (time_after(now, touch_ts + get_softlockup_thresh()))
- return now - touch_ts;
-
+ if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
+ /* Warn about unreasonable delays. */
+ if (time_after(now, touch_ts + get_softlockup_thresh()))
+ return now - touch_ts;
+ }
return 0;
}
@@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu)
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
+
+ /*
+ * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
+ * failure path. Check for failures that can occur asynchronously -
+ * for example, when CPUs are on-lined - and shut down the hardware
+ * perf event on each CPU accordingly.
+ *
+ * The only non-obvious place this bit can be cleared is through
+ * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
+ * pr_info here would be too noisy as it would result in a message
+ * every few seconds if the hardlockup was disabled but the softlockup
+ * enabled.
+ */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ watchdog_nmi_disable(cpu);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu)
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
- /*
- * Some kernels need to default hard lockup detection to
- * 'disabled', for example a guest on a hypervisor.
- */
- if (!watchdog_hardlockup_detector_is_enabled()) {
- event = ERR_PTR(-ENOENT);
- goto handle_err;
- }
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
/* is it already setup and enabled? */
if (event && event->state > PERF_EVENT_STATE_OFF)
@@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu)
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-handle_err:
/* save cpu0 error for future comparision */
if (cpu == 0 && IS_ERR(event))
cpu0_err = PTR_ERR(event);
@@ -527,6 +548,18 @@ handle_err:
goto out_save;
}
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to set up
+ * set up the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
/* skip displaying the same error again */
if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
return PTR_ERR(event);
@@ -540,6 +573,9 @@ handle_err:
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
return PTR_ERR(event);
/* success path */
@@ -600,7 +636,7 @@ static void restart_watchdog_hrtimer(void *info)
HRTIMER_MODE_REL_PINNED);
}
-static void update_timers(int cpu)
+static void update_watchdog(int cpu)
{
/*
* Make sure that perf event counter will adopt to a new
@@ -615,17 +651,17 @@ static void update_timers(int cpu)
watchdog_nmi_enable(cpu);
}
-static void update_timers_all_cpus(void)
+static void update_watchdog_all_cpus(void)
{
int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
- update_timers(cpu);
+ update_watchdog(cpu);
put_online_cpus();
}
-static int watchdog_enable_all_cpus(bool sample_period_changed)
+static int watchdog_enable_all_cpus(void)
{
int err = 0;
@@ -635,8 +671,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed)
pr_err("Failed to create watchdog threads, disabled\n");
else
watchdog_running = 1;
- } else if (sample_period_changed) {
- update_timers_all_cpus();
+ } else {
+ /*
+ * Enable/disable the lockup detectors or
+ * change the sample period 'on the fly'.
+ */
+ update_watchdog_all_cpus();
}
return err;
@@ -654,58 +694,159 @@ static void watchdog_disable_all_cpus(void)
}
/*
- * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
+ * Update the run state of the lockup detectors.
*/
+static int proc_watchdog_update(void)
+{
+ int err = 0;
+
+ /*
+ * Watchdog threads won't be started if they are already active.
+ * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
+ * care of this. If those threads are already active, the sample
+ * period will be updated and the lockup detectors will be enabled
+ * or disabled 'on the fly'.
+ */
+ if (watchdog_enabled && watchdog_thresh)
+ err = watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+ return err;
+
+}
+
+static DEFINE_MUTEX(watchdog_proc_mutex);
-int proc_dowatchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+/*
+ * common function for watchdog, nmi_watchdog and soft_watchdog parameter
+ *
+ * caller | table->data points to | 'which' contains the flag(s)
+ * -------------------|-----------------------|-----------------------------
+ * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
+ * | | with SOFT_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ */
+static int proc_watchdog_common(int which, struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old_thresh, old_enabled;
- bool old_hardlockup;
- static DEFINE_MUTEX(watchdog_proc_mutex);
+ int err, old, new;
+ int *watchdog_param = (int *)table->data;
mutex_lock(&watchdog_proc_mutex);
- old_thresh = ACCESS_ONCE(watchdog_thresh);
- old_enabled = ACCESS_ONCE(watchdog_user_enabled);
- old_hardlockup = watchdog_hardlockup_detector_is_enabled();
- err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err || !write)
- goto out;
-
- set_sample_period();
/*
- * Watchdog threads shouldn't be enabled if they are
- * disabled. The 'watchdog_running' variable check in
- * watchdog_*_all_cpus() function takes care of this.
+ * If the parameter is being read return the state of the corresponding
+ * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
+ * run state of the lockup detectors.
*/
- if (watchdog_user_enabled && watchdog_thresh) {
+ if (!write) {
+ *watchdog_param = (watchdog_enabled & which) != 0;
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ } else {
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (err)
+ goto out;
+
/*
- * Prevent a change in watchdog_thresh accidentally overriding
- * the enablement of the hardlockup detector.
+ * There is a race window between fetching the current value
+ * from 'watchdog_enabled' and storing the new value. During
+ * this race window, watchdog_nmi_enable() can sneak in and
+ * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
+ * The 'cmpxchg' detects this race and the loop retries.
*/
- if (watchdog_user_enabled != old_enabled)
- watchdog_enable_hardlockup_detector(true);
- err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
- } else
- watchdog_disable_all_cpus();
+ do {
+ old = watchdog_enabled;
+ /*
+ * If the parameter value is not zero set the
+ * corresponding bit(s), else clear it(them).
+ */
+ if (*watchdog_param)
+ new = old | which;
+ else
+ new = old & ~which;
+ } while (cmpxchg(&watchdog_enabled, old, new) != old);
- /* Restore old values on failure */
- if (err) {
- watchdog_thresh = old_thresh;
- watchdog_user_enabled = old_enabled;
- watchdog_enable_hardlockup_detector(old_hardlockup);
+ /*
+ * Update the run state of the lockup detectors.
+ * Restore 'watchdog_enabled' on failure.
+ */
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_enabled = old;
}
out:
mutex_unlock(&watchdog_proc_mutex);
return err;
}
+
+/*
+ * /proc/sys/kernel/watchdog
+ */
+int proc_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/nmi_watchdog
+ */
+int proc_nmi_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/soft_watchdog
+ */
+int proc_soft_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/watchdog_thresh
+ */
+int proc_watchdog_thresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err, old;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ old = ACCESS_ONCE(watchdog_thresh);
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (err || !write)
+ goto out;
+
+ /*
+ * Update the sample period.
+ * Restore 'watchdog_thresh' on failure.
+ */
+ set_sample_period();
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_thresh = old;
+out:
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
set_sample_period();
- if (watchdog_user_enabled)
- watchdog_enable_all_cpus(false);
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
}
diff --git a/lib/Kconfig b/lib/Kconfig
index 87da53bb1fef..b039e70fd31d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -189,6 +189,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 48f73efd29fe..ea369ddfb3d8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1808,6 +1808,18 @@ config TEST_UDELAY
If unsure, say N.
+config MEMTEST
+ bool "Memtest"
+ depends on MEMBLOCK
+ ---help---
+ This option adds a kernel parameter 'memtest', which allows memtest
+ to be set.
+ memtest=0, mean disabled; -- default
+ memtest=1, mean do 1 test pattern;
+ ...
+ memtest=17, mean do 17 test patterns.
+ If you are unsure how to answer this question, answer N.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
diff --git a/lib/Makefile b/lib/Makefile
index 58f74d2dd396..7b320c818d1a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ obj-y += lockref.o
obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
- bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
+ bsearch.o find_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
@@ -76,6 +76,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
diff --git a/lib/cpumask.c b/lib/cpumask.c
index b6513a9f2892..5ab1553fd076 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -37,10 +37,11 @@ EXPORT_SYMBOL(__next_cpu_nr);
int cpumask_next_and(int n, const struct cpumask *src1p,
const struct cpumask *src2p)
{
- while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
- if (cpumask_test_cpu(n, src2p))
- break;
- return n;
+ struct cpumask tmp;
+
+ if (cpumask_and(&tmp, src1p, src2p))
+ return cpumask_next(n, &tmp);
+ return nr_cpu_ids;
}
EXPORT_SYMBOL(cpumask_next_and);
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 9722bd2dbc9b..ae4b65e17e64 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -361,7 +361,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
unsigned int range = 0;
while (range <= max_range) {
- entry = __hash_bucket_find(*bucket, &index, containing_match);
+ entry = __hash_bucket_find(*bucket, ref, containing_match);
if (entry)
return entry;
diff --git a/lib/find_bit.c b/lib/find_bit.c
new file mode 100644
index 000000000000..18072ea9c20e
--- /dev/null
+++ b/lib/find_bit.c
@@ -0,0 +1,193 @@
+/* bit search implementation
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howell's find_next_bit implementation)
+ *
+ * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
+ * size and improve performance, 2015.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/bitops.h>
+#include <linux/bitmap.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+
+#if !defined(find_next_bit) || !defined(find_next_zero_bit)
+
+/*
+ * This is a common helper function for find_next_bit and
+ * find_next_zero_bit. The difference is the "invert" argument, which
+ * is XORed with each fetched word before searching it for one bits.
+ */
+static unsigned long _find_next_bit(const unsigned long *addr,
+ unsigned long nbits, unsigned long start, unsigned long invert)
+{
+ unsigned long tmp;
+
+ if (!nbits || start >= nbits)
+ return nbits;
+
+ tmp = addr[start / BITS_PER_LONG] ^ invert;
+
+ /* Handle 1st word. */
+ tmp &= BITMAP_FIRST_WORD_MASK(start);
+ start = round_down(start, BITS_PER_LONG);
+
+ while (!tmp) {
+ start += BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = addr[start / BITS_PER_LONG] ^ invert;
+ }
+
+ return min(start + __ffs(tmp), nbits);
+}
+#endif
+
+#ifndef find_next_bit
+/*
+ * Find the next set bit in a memory region.
+ */
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ return _find_next_bit(addr, size, offset, 0UL);
+}
+EXPORT_SYMBOL(find_next_bit);
+#endif
+
+#ifndef find_next_zero_bit
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ return _find_next_bit(addr, size, offset, ~0UL);
+}
+EXPORT_SYMBOL(find_next_zero_bit);
+#endif
+
+#ifndef find_first_bit
+/*
+ * Find the first set bit in a memory region.
+ */
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long idx;
+
+ for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+ if (addr[idx])
+ return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
+ }
+
+ return size;
+}
+EXPORT_SYMBOL(find_first_bit);
+#endif
+
+#ifndef find_first_zero_bit
+/*
+ * Find the first cleared bit in a memory region.
+ */
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long idx;
+
+ for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+ if (addr[idx] != ~0UL)
+ return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
+ }
+
+ return size;
+}
+EXPORT_SYMBOL(find_first_zero_bit);
+#endif
+
+#ifndef find_last_bit
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+ if (size) {
+ unsigned long val = BITMAP_LAST_WORD_MASK(size);
+ unsigned long idx = (size-1) / BITS_PER_LONG;
+
+ do {
+ val &= addr[idx];
+ if (val)
+ return idx * BITS_PER_LONG + __fls(val);
+
+ val = ~0ul;
+ } while (idx--);
+ }
+ return size;
+}
+EXPORT_SYMBOL(find_last_bit);
+#endif
+
+#ifdef __BIG_ENDIAN
+
+/* include/linux/byteorder does not support "unsigned long" type */
+static inline unsigned long ext2_swab(const unsigned long y)
+{
+#if BITS_PER_LONG == 64
+ return (unsigned long) __swab64((u64) y);
+#elif BITS_PER_LONG == 32
+ return (unsigned long) __swab32((u32) y);
+#else
+#error BITS_PER_LONG not defined
+#endif
+}
+
+#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
+static unsigned long _find_next_bit_le(const unsigned long *addr,
+ unsigned long nbits, unsigned long start, unsigned long invert)
+{
+ unsigned long tmp;
+
+ if (!nbits || start >= nbits)
+ return nbits;
+
+ tmp = addr[start / BITS_PER_LONG] ^ invert;
+
+ /* Handle 1st word. */
+ tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start));
+ start = round_down(start, BITS_PER_LONG);
+
+ while (!tmp) {
+ start += BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = addr[start / BITS_PER_LONG] ^ invert;
+ }
+
+ return min(start + __ffs(ext2_swab(tmp)), nbits);
+}
+#endif
+
+#ifndef find_next_zero_bit_le
+unsigned long find_next_zero_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ return _find_next_bit_le(addr, size, offset, ~0UL);
+}
+EXPORT_SYMBOL(find_next_zero_bit_le);
+#endif
+
+#ifndef find_next_bit_le
+unsigned long find_next_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ return _find_next_bit_le(addr, size, offset, 0UL);
+}
+EXPORT_SYMBOL(find_next_bit_le);
+#endif
+
+#endif /* __BIG_ENDIAN */
diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
index 91ca09fbf6f9..3e3be40c6a6e 100644
--- a/lib/find_last_bit.c
+++ b/lib/find_last_bit.c
@@ -4,6 +4,9 @@
* Written by Rusty Russell <rusty@rustcorp.com.au>
* (Inspired by David Howell's find_next_bit implementation)
*
+ * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
+ * size and improve performance, 2015.
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
@@ -11,37 +14,26 @@
*/
#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/export.h>
-#include <asm/types.h>
-#include <asm/byteorder.h>
+#include <linux/kernel.h>
#ifndef find_last_bit
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
- unsigned long words;
- unsigned long tmp;
-
- /* Start at final word. */
- words = size / BITS_PER_LONG;
+ if (size) {
+ unsigned long val = BITMAP_LAST_WORD_MASK(size);
+ unsigned long idx = (size-1) / BITS_PER_LONG;
- /* Partial final word? */
- if (size & (BITS_PER_LONG-1)) {
- tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
- - (size & (BITS_PER_LONG-1)))));
- if (tmp)
- goto found;
- }
+ do {
+ val &= addr[idx];
+ if (val)
+ return idx * BITS_PER_LONG + __fls(val);
- while (words) {
- tmp = addr[--words];
- if (tmp) {
-found:
- return words * BITS_PER_LONG + __fls(tmp);
- }
+ val = ~0ul;
+ } while (idx--);
}
-
- /* Not found */
return size;
}
EXPORT_SYMBOL(find_last_bit);
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
deleted file mode 100644
index 0cbfc0b4398f..000000000000
--- a/lib/find_next_bit.c
+++ /dev/null
@@ -1,285 +0,0 @@
-/* find_next_bit.c: fallback find next bit implementation
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/bitops.h>
-#include <linux/export.h>
-#include <asm/types.h>
-#include <asm/byteorder.h>
-
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
-
-#ifndef find_next_bit
-/*
- * Find the next set bit in a memory region.
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
- unsigned long offset)
-{
- const unsigned long *p = addr + BITOP_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG-1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-}
-EXPORT_SYMBOL(find_next_bit);
-#endif
-
-#ifndef find_next_zero_bit
-/*
- * This implementation of find_{first,next}_zero_bit was stolen from
- * Linus' asm-alpha/bitops.h.
- */
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
- unsigned long offset)
-{
- const unsigned long *p = addr + BITOP_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG-1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp |= ~0UL >> (BITS_PER_LONG - offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (~tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if (~(tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found_middle:
- return result + ffz(tmp);
-}
-EXPORT_SYMBOL(find_next_zero_bit);
-#endif
-
-#ifndef find_first_bit
-/*
- * Find the first set bit in a memory region.
- */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
-{
- const unsigned long *p = addr;
- unsigned long result = 0;
- unsigned long tmp;
-
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
-
- tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found:
- return result + __ffs(tmp);
-}
-EXPORT_SYMBOL(find_first_bit);
-#endif
-
-#ifndef find_first_zero_bit
-/*
- * Find the first cleared bit in a memory region.
- */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
-{
- const unsigned long *p = addr;
- unsigned long result = 0;
- unsigned long tmp;
-
- while (size & ~(BITS_PER_LONG-1)) {
- if (~(tmp = *(p++)))
- goto found;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
-
- tmp = (*p) | (~0UL << size);
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found:
- return result + ffz(tmp);
-}
-EXPORT_SYMBOL(find_first_zero_bit);
-#endif
-
-#ifdef __BIG_ENDIAN
-
-/* include/linux/byteorder does not support "unsigned long" type */
-static inline unsigned long ext2_swabp(const unsigned long * x)
-{
-#if BITS_PER_LONG == 64
- return (unsigned long) __swab64p((u64 *) x);
-#elif BITS_PER_LONG == 32
- return (unsigned long) __swab32p((u32 *) x);
-#else
-#error BITS_PER_LONG not defined
-#endif
-}
-
-/* include/linux/byteorder doesn't support "unsigned long" type */
-static inline unsigned long ext2_swab(const unsigned long y)
-{
-#if BITS_PER_LONG == 64
- return (unsigned long) __swab64((u64) y);
-#elif BITS_PER_LONG == 32
- return (unsigned long) __swab32((u32) y);
-#else
-#error BITS_PER_LONG not defined
-#endif
-}
-
-#ifndef find_next_zero_bit_le
-unsigned long find_next_zero_bit_le(const void *addr, unsigned
- long size, unsigned long offset)
-{
- const unsigned long *p = addr;
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- p += BITOP_WORD(offset);
- size -= result;
- offset &= (BITS_PER_LONG - 1UL);
- if (offset) {
- tmp = ext2_swabp(p++);
- tmp |= (~0UL >> (BITS_PER_LONG - offset));
- if (size < BITS_PER_LONG)
- goto found_first;
- if (~tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
-
- while (size & ~(BITS_PER_LONG - 1)) {
- if (~(tmp = *(p++)))
- goto found_middle_swap;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = ext2_swabp(p);
-found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. Skip ffz */
-found_middle:
- return result + ffz(tmp);
-
-found_middle_swap:
- return result + ffz(ext2_swab(tmp));
-}
-EXPORT_SYMBOL(find_next_zero_bit_le);
-#endif
-
-#ifndef find_next_bit_le
-unsigned long find_next_bit_le(const void *addr, unsigned
- long size, unsigned long offset)
-{
- const unsigned long *p = addr;
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- p += BITOP_WORD(offset);
- size -= result;
- offset &= (BITS_PER_LONG - 1UL);
- if (offset) {
- tmp = ext2_swabp(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
-
- while (size & ~(BITS_PER_LONG - 1)) {
- tmp = *(p++);
- if (tmp)
- goto found_middle_swap;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = ext2_swabp(p);
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-
-found_middle_swap:
- return result + __ffs(ext2_swab(tmp));
-}
-EXPORT_SYMBOL(find_next_bit_le);
-#endif
-
-#endif /* __BIG_ENDIAN */
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 0c9216c48762..86c8911b0e3a 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -13,6 +13,43 @@
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static int __read_mostly ioremap_pud_capable;
+static int __read_mostly ioremap_pmd_capable;
+static int __read_mostly ioremap_huge_disabled;
+
+static int __init set_nohugeiomap(char *str)
+{
+ ioremap_huge_disabled = 1;
+ return 0;
+}
+early_param("nohugeiomap", set_nohugeiomap);
+
+void __init ioremap_huge_init(void)
+{
+ if (!ioremap_huge_disabled) {
+ if (arch_ioremap_pud_supported())
+ ioremap_pud_capable = 1;
+ if (arch_ioremap_pmd_supported())
+ ioremap_pmd_capable = 1;
+ }
+}
+
+static inline int ioremap_pud_enabled(void)
+{
+ return ioremap_pud_capable;
+}
+
+static inline int ioremap_pmd_enabled(void)
+{
+ return ioremap_pmd_capable;
+}
+
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static inline int ioremap_pud_enabled(void) { return 0; }
+static inline int ioremap_pmd_enabled(void) { return 0; }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
@@ -43,6 +80,14 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
+
+ if (ioremap_pmd_enabled() &&
+ ((next - addr) == PMD_SIZE) &&
+ IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
+ if (pmd_set_huge(pmd, phys_addr + addr, prot))
+ continue;
+ }
+
if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
@@ -61,6 +106,14 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
+
+ if (ioremap_pud_enabled() &&
+ ((next - addr) == PUD_SIZE) &&
+ IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
+ if (pud_set_huge(pud, phys_addr + addr, prot))
+ continue;
+ }
+
if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 852c81e3ba9a..028f5d996eef 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -247,10 +247,11 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
* progress) and "changed", when this in fact lead to an successful
* update of the cache.
*/
- return seq_printf(seq, "\t%s: used:%u/%u "
- "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
- lc->name, lc->used, lc->nr_elements,
- lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
+ seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
+ lc->name, lc->used, lc->nr_elements,
+ lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
+
+ return 0;
}
static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 8f8c4417f228..1826c7407258 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -239,29 +239,21 @@ int string_unescape(char *src, char *dst, size_t size, unsigned int flags)
}
EXPORT_SYMBOL(string_unescape);
-static int escape_passthrough(unsigned char c, char **dst, size_t *osz)
+static bool escape_passthrough(unsigned char c, char **dst, char *end)
{
char *out = *dst;
- if (*osz < 1)
- return -ENOMEM;
-
- *out++ = c;
-
- *dst = out;
- *osz -= 1;
-
- return 1;
+ if (out < end)
+ *out = c;
+ *dst = out + 1;
+ return true;
}
-static int escape_space(unsigned char c, char **dst, size_t *osz)
+static bool escape_space(unsigned char c, char **dst, char *end)
{
char *out = *dst;
unsigned char to;
- if (*osz < 2)
- return -ENOMEM;
-
switch (c) {
case '\n':
to = 'n';
@@ -279,26 +271,25 @@ static int escape_space(unsigned char c, char **dst, size_t *osz)
to = 'f';
break;
default:
- return 0;
+ return false;
}
- *out++ = '\\';
- *out++ = to;
+ if (out < end)
+ *out = '\\';
+ ++out;
+ if (out < end)
+ *out = to;
+ ++out;
*dst = out;
- *osz -= 2;
-
- return 1;
+ return true;
}
-static int escape_special(unsigned char c, char **dst, size_t *osz)
+static bool escape_special(unsigned char c, char **dst, char *end)
{
char *out = *dst;
unsigned char to;
- if (*osz < 2)
- return -ENOMEM;
-
switch (c) {
case '\\':
to = '\\';
@@ -310,71 +301,78 @@ static int escape_special(unsigned char c, char **dst, size_t *osz)
to = 'e';
break;
default:
- return 0;
+ return false;
}
- *out++ = '\\';
- *out++ = to;
+ if (out < end)
+ *out = '\\';
+ ++out;
+ if (out < end)
+ *out = to;
+ ++out;
*dst = out;
- *osz -= 2;
-
- return 1;
+ return true;
}
-static int escape_null(unsigned char c, char **dst, size_t *osz)
+static bool escape_null(unsigned char c, char **dst, char *end)
{
char *out = *dst;
- if (*osz < 2)
- return -ENOMEM;
-
if (c)
- return 0;
+ return false;
- *out++ = '\\';
- *out++ = '0';
+ if (out < end)
+ *out = '\\';
+ ++out;
+ if (out < end)
+ *out = '0';
+ ++out;
*dst = out;
- *osz -= 2;
-
- return 1;
+ return true;
}
-static int escape_octal(unsigned char c, char **dst, size_t *osz)
+static bool escape_octal(unsigned char c, char **dst, char *end)
{
char *out = *dst;
- if (*osz < 4)
- return -ENOMEM;
-
- *out++ = '\\';
- *out++ = ((c >> 6) & 0x07) + '0';
- *out++ = ((c >> 3) & 0x07) + '0';
- *out++ = ((c >> 0) & 0x07) + '0';
+ if (out < end)
+ *out = '\\';
+ ++out;
+ if (out < end)
+ *out = ((c >> 6) & 0x07) + '0';
+ ++out;
+ if (out < end)
+ *out = ((c >> 3) & 0x07) + '0';
+ ++out;
+ if (out < end)
+ *out = ((c >> 0) & 0x07) + '0';
+ ++out;
*dst = out;
- *osz -= 4;
-
- return 1;
+ return true;
}
-static int escape_hex(unsigned char c, char **dst, size_t *osz)
+static bool escape_hex(unsigned char c, char **dst, char *end)
{
char *out = *dst;
- if (*osz < 4)
- return -ENOMEM;
-
- *out++ = '\\';
- *out++ = 'x';
- *out++ = hex_asc_hi(c);
- *out++ = hex_asc_lo(c);
+ if (out < end)
+ *out = '\\';
+ ++out;
+ if (out < end)
+ *out = 'x';
+ ++out;
+ if (out < end)
+ *out = hex_asc_hi(c);
+ ++out;
+ if (out < end)
+ *out = hex_asc_lo(c);
+ ++out;
*dst = out;
- *osz -= 4;
-
- return 1;
+ return true;
}
/**
@@ -426,19 +424,17 @@ static int escape_hex(unsigned char c, char **dst, size_t *osz)
* it if needs.
*
* Return:
- * The amount of the characters processed to the destination buffer, or
- * %-ENOMEM if the size of buffer is not enough to put an escaped character is
- * returned.
- *
- * Even in the case of error @dst pointer will be updated to point to the byte
- * after the last processed character.
+ * The total size of the escaped output that would be generated for
+ * the given input and flags. To check whether the output was
+ * truncated, compare the return value to osz. There is room left in
+ * dst for a '\0' terminator if and only if ret < osz.
*/
-int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz,
+int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
unsigned int flags, const char *esc)
{
- char *out = *dst, *p = out;
+ char *p = dst;
+ char *end = p + osz;
bool is_dict = esc && *esc;
- int ret = 0;
while (isz--) {
unsigned char c = *src++;
@@ -458,55 +454,26 @@ int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz,
(is_dict && !strchr(esc, c))) {
/* do nothing */
} else {
- if (flags & ESCAPE_SPACE) {
- ret = escape_space(c, &p, &osz);
- if (ret < 0)
- break;
- if (ret > 0)
- continue;
- }
-
- if (flags & ESCAPE_SPECIAL) {
- ret = escape_special(c, &p, &osz);
- if (ret < 0)
- break;
- if (ret > 0)
- continue;
- }
-
- if (flags & ESCAPE_NULL) {
- ret = escape_null(c, &p, &osz);
- if (ret < 0)
- break;
- if (ret > 0)
- continue;
- }
+ if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
+ continue;
+
+ if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end))
+ continue;
+
+ if (flags & ESCAPE_NULL && escape_null(c, &p, end))
+ continue;
/* ESCAPE_OCTAL and ESCAPE_HEX always go last */
- if (flags & ESCAPE_OCTAL) {
- ret = escape_octal(c, &p, &osz);
- if (ret < 0)
- break;
+ if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end))
continue;
- }
- if (flags & ESCAPE_HEX) {
- ret = escape_hex(c, &p, &osz);
- if (ret < 0)
- break;
+
+ if (flags & ESCAPE_HEX && escape_hex(c, &p, end))
continue;
- }
}
- ret = escape_passthrough(c, &p, &osz);
- if (ret < 0)
- break;
+ escape_passthrough(c, &p, end);
}
- *dst = p;
-
- if (ret < 0)
- return ret;
-
- return p - out;
+ return p - dst;
}
EXPORT_SYMBOL(string_escape_mem);
diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c
index ab0d30e1e18f..8e376efd88a4 100644
--- a/lib/test-string_helpers.c
+++ b/lib/test-string_helpers.c
@@ -260,16 +260,28 @@ static __init const char *test_string_find_match(const struct test_string_2 *s2,
return NULL;
}
+static __init void
+test_string_escape_overflow(const char *in, int p, unsigned int flags, const char *esc,
+ int q_test, const char *name)
+{
+ int q_real;
+
+ q_real = string_escape_mem(in, p, NULL, 0, flags, esc);
+ if (q_real != q_test)
+ pr_warn("Test '%s' failed: flags = %u, osz = 0, expected %d, got %d\n",
+ name, flags, q_test, q_real);
+}
+
static __init void test_string_escape(const char *name,
const struct test_string_2 *s2,
unsigned int flags, const char *esc)
{
- int q_real = 512;
- char *out_test = kmalloc(q_real, GFP_KERNEL);
- char *out_real = kmalloc(q_real, GFP_KERNEL);
+ size_t out_size = 512;
+ char *out_test = kmalloc(out_size, GFP_KERNEL);
+ char *out_real = kmalloc(out_size, GFP_KERNEL);
char *in = kmalloc(256, GFP_KERNEL);
- char *buf = out_real;
int p = 0, q_test = 0;
+ int q_real;
if (!out_test || !out_real || !in)
goto out;
@@ -301,29 +313,19 @@ static __init void test_string_escape(const char *name,
q_test += len;
}
- q_real = string_escape_mem(in, p, &buf, q_real, flags, esc);
+ q_real = string_escape_mem(in, p, out_real, out_size, flags, esc);
test_string_check_buf(name, flags, in, p, out_real, q_real, out_test,
q_test);
+
+ test_string_escape_overflow(in, p, flags, esc, q_test, name);
+
out:
kfree(in);
kfree(out_real);
kfree(out_test);
}
-static __init void test_string_escape_nomem(void)
-{
- char *in = "\eb \\C\007\"\x90\r]";
- char out[64], *buf = out;
- int rc = -ENOMEM, ret;
-
- ret = string_escape_str_any_np(in, &buf, strlen(in), NULL);
- if (ret == rc)
- return;
-
- pr_err("Test 'escape nomem' failed: got %d instead of %d\n", ret, rc);
-}
-
static int __init test_string_helpers_init(void)
{
unsigned int i;
@@ -342,8 +344,6 @@ static int __init test_string_helpers_init(void)
for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1);
- test_string_escape_nomem();
-
return -EINVAL;
}
module_init(test_string_helpers_init);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b235c96167d3..8243e2fb1e6b 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -17,6 +17,7 @@
*/
#include <stdarg.h>
+#include <linux/clk-provider.h>
#include <linux/module.h> /* for KSYM_SYMBOL_LEN */
#include <linux/types.h>
#include <linux/string.h>
@@ -32,6 +33,7 @@
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/sections.h> /* for dereference_function_descriptor() */
+#include <asm/byteorder.h> /* cpu_to_le16 */
#include <linux/string_helpers.h>
#include "kstrtox.h"
@@ -121,142 +123,145 @@ int skip_atoi(const char **s)
return i;
}
-/* Decimal conversion is by far the most typical, and is used
- * for /proc and /sys data. This directly impacts e.g. top performance
- * with many processes running. We optimize it for speed
- * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
- * (with permission from the author, Douglas W. Jones).
+/*
+ * Decimal conversion is by far the most typical, and is used for
+ * /proc and /sys data. This directly impacts e.g. top performance
+ * with many processes running. We optimize it for speed by emitting
+ * two characters at a time, using a 200 byte lookup table. This
+ * roughly halves the number of multiplications compared to computing
+ * the digits one at a time. Implementation strongly inspired by the
+ * previous version, which in turn used ideas described at
+ * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
+ * from the author, Douglas W. Jones).
+ *
+ * It turns out there is precisely one 26 bit fixed-point
+ * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
+ * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
+ * range happens to be somewhat larger (x <= 1073741898), but that's
+ * irrelevant for our purpose.
+ *
+ * For dividing a number in the range [10^4, 10^6-1] by 100, we still
+ * need a 32x32->64 bit multiply, so we simply use the same constant.
+ *
+ * For dividing a number in the range [100, 10^4-1] by 100, there are
+ * several options. The simplest is (x * 0x147b) >> 19, which is valid
+ * for all x <= 43698.
*/
-#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
-/* Formats correctly any integer in [0, 999999999] */
+static const u16 decpair[100] = {
+#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
+ _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
+ _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
+ _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
+ _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
+ _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
+ _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
+ _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
+ _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
+ _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
+ _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
+#undef _
+};
+
+/*
+ * This will print a single '0' even if r == 0, since we would
+ * immediately jump to out_r where two 0s would be written but only
+ * one of them accounted for in buf. This is needed by ip4_string
+ * below. All other callers pass a non-zero value of r.
+*/
static noinline_for_stack
-char *put_dec_full9(char *buf, unsigned q)
+char *put_dec_trunc8(char *buf, unsigned r)
{
- unsigned r;
+ unsigned q;
- /*
- * Possible ways to approx. divide by 10
- * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
- * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul)
- * (x * 0x6667) >> 18 x < 43699
- * (x * 0x3334) >> 17 x < 16389
- * (x * 0x199a) >> 16 x < 16389
- * (x * 0x0ccd) >> 15 x < 16389
- * (x * 0x0667) >> 14 x < 2739
- * (x * 0x0334) >> 13 x < 1029
- * (x * 0x019a) >> 12 x < 1029
- * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386)
- * (x * 0x0067) >> 10 x < 179
- * (x * 0x0034) >> 9 x < 69 same
- * (x * 0x001a) >> 8 x < 69 same
- * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386)
- * (x * 0x0007) >> 6 x < 19
- * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
- */
- r = (q * (uint64_t)0x1999999a) >> 32;
- *buf++ = (q - 10 * r) + '0'; /* 1 */
- q = (r * (uint64_t)0x1999999a) >> 32;
- *buf++ = (r - 10 * q) + '0'; /* 2 */
- r = (q * (uint64_t)0x1999999a) >> 32;
- *buf++ = (q - 10 * r) + '0'; /* 3 */
- q = (r * (uint64_t)0x1999999a) >> 32;
- *buf++ = (r - 10 * q) + '0'; /* 4 */
- r = (q * (uint64_t)0x1999999a) >> 32;
- *buf++ = (q - 10 * r) + '0'; /* 5 */
- /* Now value is under 10000, can avoid 64-bit multiply */
- q = (r * 0x199a) >> 16;
- *buf++ = (r - 10 * q) + '0'; /* 6 */
- r = (q * 0xcd) >> 11;
- *buf++ = (q - 10 * r) + '0'; /* 7 */
- q = (r * 0xcd) >> 11;
- *buf++ = (r - 10 * q) + '0'; /* 8 */
- *buf++ = q + '0'; /* 9 */
+ /* 1 <= r < 10^8 */
+ if (r < 100)
+ goto out_r;
+
+ /* 100 <= r < 10^8 */
+ q = (r * (u64)0x28f5c29) >> 32;
+ *((u16 *)buf) = decpair[r - 100*q];
+ buf += 2;
+
+ /* 1 <= q < 10^6 */
+ if (q < 100)
+ goto out_q;
+
+ /* 100 <= q < 10^6 */
+ r = (q * (u64)0x28f5c29) >> 32;
+ *((u16 *)buf) = decpair[q - 100*r];
+ buf += 2;
+
+ /* 1 <= r < 10^4 */
+ if (r < 100)
+ goto out_r;
+
+ /* 100 <= r < 10^4 */
+ q = (r * 0x147b) >> 19;
+ *((u16 *)buf) = decpair[r - 100*q];
+ buf += 2;
+out_q:
+ /* 1 <= q < 100 */
+ r = q;
+out_r:
+ /* 1 <= r < 100 */
+ *((u16 *)buf) = decpair[r];
+ buf += r < 10 ? 1 : 2;
return buf;
}
-#endif
-/* Similar to above but do not pad with zeros.
- * Code can be easily arranged to print 9 digits too, but our callers
- * always call put_dec_full9() instead when the number has 9 decimal digits.
- */
+#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
static noinline_for_stack
-char *put_dec_trunc8(char *buf, unsigned r)
+char *put_dec_full8(char *buf, unsigned r)
{
unsigned q;
- /* Copy of previous function's body with added early returns */
- while (r >= 10000) {
- q = r + '0';
- r = (r * (uint64_t)0x1999999a) >> 32;
- *buf++ = q - 10*r;
- }
-
- q = (r * 0x199a) >> 16; /* r <= 9999 */
- *buf++ = (r - 10 * q) + '0';
- if (q == 0)
- return buf;
- r = (q * 0xcd) >> 11; /* q <= 999 */
- *buf++ = (q - 10 * r) + '0';
- if (r == 0)
- return buf;
- q = (r * 0xcd) >> 11; /* r <= 99 */
- *buf++ = (r - 10 * q) + '0';
- if (q == 0)
- return buf;
- *buf++ = q + '0'; /* q <= 9 */
- return buf;
-}
+ /* 0 <= r < 10^8 */
+ q = (r * (u64)0x28f5c29) >> 32;
+ *((u16 *)buf) = decpair[r - 100*q];
+ buf += 2;
-/* There are two algorithms to print larger numbers.
- * One is generic: divide by 1000000000 and repeatedly print
- * groups of (up to) 9 digits. It's conceptually simple,
- * but requires a (unsigned long long) / 1000000000 division.
- *
- * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
- * manipulates them cleverly and generates groups of 4 decimal digits.
- * It so happens that it does NOT require long long division.
- *
- * If long is > 32 bits, division of 64-bit values is relatively easy,
- * and we will use the first algorithm.
- * If long long is > 64 bits (strange architecture with VERY large long long),
- * second algorithm can't be used, and we again use the first one.
- *
- * Else (if long is 32 bits and long long is 64 bits) we use second one.
- */
+ /* 0 <= q < 10^6 */
+ r = (q * (u64)0x28f5c29) >> 32;
+ *((u16 *)buf) = decpair[q - 100*r];
+ buf += 2;
-#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
+ /* 0 <= r < 10^4 */
+ q = (r * 0x147b) >> 19;
+ *((u16 *)buf) = decpair[r - 100*q];
+ buf += 2;
-/* First algorithm: generic */
+ /* 0 <= q < 100 */
+ *((u16 *)buf) = decpair[q];
+ buf += 2;
+ return buf;
+}
-static
+static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
- if (n >= 100*1000*1000) {
- while (n >= 1000*1000*1000)
- buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));
- if (n >= 100*1000*1000)
- return put_dec_full9(buf, n);
- }
+ if (n >= 100*1000*1000)
+ buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
+ /* 1 <= n <= 1.6e11 */
+ if (n >= 100*1000*1000)
+ buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
+ /* 1 <= n < 1e8 */
return put_dec_trunc8(buf, n);
}
-#else
-
-/* Second algorithm: valid only for 64-bit long longs */
+#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
-/* See comment in put_dec_full9 for choice of constants */
-static noinline_for_stack
-void put_dec_full4(char *buf, unsigned q)
+static void
+put_dec_full4(char *buf, unsigned r)
{
- unsigned r;
- r = (q * 0xccd) >> 15;
- buf[0] = (q - 10 * r) + '0';
- q = (r * 0xcd) >> 11;
- buf[1] = (r - 10 * q) + '0';
- r = (q * 0xcd) >> 11;
- buf[2] = (q - 10 * r) + '0';
- buf[3] = r + '0';
+ unsigned q;
+
+ /* 0 <= r < 10^4 */
+ q = (r * 0x147b) >> 19;
+ *((u16 *)buf) = decpair[r - 100*q];
+ buf += 2;
+ /* 0 <= q < 100 */
+ *((u16 *)buf) = decpair[q];
}
/*
@@ -264,9 +269,9 @@ void put_dec_full4(char *buf, unsigned q)
* The approximation x/10000 == (x * 0x346DC5D7) >> 43
* holds for all x < 1,128,869,999. The largest value this
* helper will ever be asked to convert is 1,125,520,955.
- * (d1 in the put_dec code, assuming n is all-ones).
+ * (second call in the put_dec code, assuming n is all-ones).
*/
-static
+static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
@@ -293,6 +298,8 @@ char *put_dec(char *buf, unsigned long long n)
d2 = (h ) & 0xffff;
d3 = (h >> 16); /* implicit "& 0xffff" */
+ /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
+ = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
q = put_dec_helper4(buf, q);
@@ -322,7 +329,8 @@ char *put_dec(char *buf, unsigned long long n)
*/
int num_to_str(char *buf, int size, unsigned long long num)
{
- char tmp[sizeof(num) * 3];
+ /* put_dec requires 2-byte alignment of the buffer. */
+ char tmp[sizeof(num) * 3] __aligned(2);
int idx, len;
/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
@@ -340,11 +348,11 @@ int num_to_str(char *buf, int size, unsigned long long num)
return len;
}
-#define ZEROPAD 1 /* pad with zero */
-#define SIGN 2 /* unsigned/signed long */
+#define SIGN 1 /* unsigned/signed, must be 1 */
+#define LEFT 2 /* left justified */
#define PLUS 4 /* show plus */
#define SPACE 8 /* space if plus */
-#define LEFT 16 /* left justified */
+#define ZEROPAD 16 /* pad with zero, must be 16 == '0' - ' ' */
#define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL 64 /* prefix hex with "0x", octal with "0" */
@@ -383,10 +391,8 @@ static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
struct printf_spec spec)
{
- /* we are called with base 8, 10 or 16, only, thus don't need "G..." */
- static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */
-
- char tmp[66];
+ /* put_dec requires 2-byte alignment of the buffer. */
+ char tmp[3 * sizeof(num)] __aligned(2);
char sign;
char locase;
int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -422,12 +428,7 @@ char *number(char *buf, char *end, unsigned long long num,
/* generate full string in tmp[], in reverse order */
i = 0;
if (num < spec.base)
- tmp[i++] = digits[num] | locase;
- /* Generic code, for any base:
- else do {
- tmp[i++] = (digits[do_div(num,base)] | locase);
- } while (num != 0);
- */
+ tmp[i++] = hex_asc_upper[num] | locase;
else if (spec.base != 10) { /* 8 or 16 */
int mask = spec.base - 1;
int shift = 3;
@@ -435,7 +436,7 @@ char *number(char *buf, char *end, unsigned long long num,
if (spec.base == 16)
shift = 4;
do {
- tmp[i++] = (digits[((unsigned char)num) & mask] | locase);
+ tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
num >>= shift;
} while (num);
} else { /* base 10 */
@@ -447,7 +448,7 @@ char *number(char *buf, char *end, unsigned long long num,
spec.precision = i;
/* leading space padding */
spec.field_width -= spec.precision;
- if (!(spec.flags & (ZEROPAD+LEFT))) {
+ if (!(spec.flags & (ZEROPAD | LEFT))) {
while (--spec.field_width >= 0) {
if (buf < end)
*buf = ' ';
@@ -475,7 +476,8 @@ char *number(char *buf, char *end, unsigned long long num,
}
/* zero or space padding */
if (!(spec.flags & LEFT)) {
- char c = (spec.flags & ZEROPAD) ? '0' : ' ';
+ char c = ' ' + (spec.flags & ZEROPAD);
+ BUILD_BUG_ON(' ' + ZEROPAD != '0');
while (--spec.field_width >= 0) {
if (buf < end)
*buf = c;
@@ -783,11 +785,19 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
if (spec.field_width > 0)
len = min_t(int, spec.field_width, 64);
- for (i = 0; i < len && buf < end - 1; i++) {
- buf = hex_byte_pack(buf, addr[i]);
+ for (i = 0; i < len; ++i) {
+ if (buf < end)
+ *buf = hex_asc_hi(addr[i]);
+ ++buf;
+ if (buf < end)
+ *buf = hex_asc_lo(addr[i]);
+ ++buf;
- if (buf < end && separator && i != len - 1)
- *buf++ = separator;
+ if (separator && i != len - 1) {
+ if (buf < end)
+ *buf = separator;
+ ++buf;
+ }
}
return buf;
@@ -942,7 +952,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
break;
}
for (i = 0; i < 4; i++) {
- char temp[3]; /* hold each IP quad in reverse order */
+ char temp[4] __aligned(2); /* hold each IP quad in reverse order */
int digits = put_dec_trunc8(temp, addr[index]) - temp;
if (leading_zeros) {
if (digits < 3)
@@ -1233,8 +1243,12 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
len = spec.field_width < 0 ? 1 : spec.field_width;
- /* Ignore the error. We print as many characters as we can */
- string_escape_mem(addr, len, &buf, end - buf, flags, NULL);
+ /*
+ * string_escape_mem() writes as many characters as it can to
+ * the given buffer, and returns the total size of the output
+ * had the buffer been big enough.
+ */
+ buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL);
return buf;
}
@@ -1322,6 +1336,45 @@ char *address_val(char *buf, char *end, const void *addr,
return number(buf, end, num, spec);
}
+static noinline_for_stack
+char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
+ const char *fmt)
+{
+ if (!IS_ENABLED(CONFIG_HAVE_CLK) || !clk)
+ return string(buf, end, NULL, spec);
+
+ switch (fmt[1]) {
+ case 'r':
+ return number(buf, end, clk_get_rate(clk), spec);
+
+ case 'n':
+ default:
+#ifdef CONFIG_COMMON_CLK
+ return string(buf, end, __clk_get_name(clk), spec);
+#else
+ spec.base = 16;
+ spec.field_width = sizeof(unsigned long) * 2 + 2;
+ spec.flags |= SPECIAL | SMALL | ZEROPAD;
+ return number(buf, end, (unsigned long)clk, spec);
+#endif
+ }
+}
+
+static noinline_for_stack
+char *comm_name(char *buf, char *end, struct task_struct *tsk,
+ struct printf_spec spec, const char *fmt)
+{
+ char name[TASK_COMM_LEN];
+
+ /* Caller can pass NULL instead of current. */
+ if (!tsk)
+ tsk = current;
+ /* Not using get_task_comm() in case I'm in IRQ context. */
+ memcpy(name, tsk->comm, TASK_COMM_LEN);
+ name[sizeof(name) - 1] = '\0';
+ return string(buf, end, name, spec);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1404,6 +1457,12 @@ int kptr_restrict __read_mostly;
* (default assumed to be phys_addr_t, passed by reference)
* - 'd[234]' For a dentry name (optionally 2-4 last components)
* - 'D[234]' Same as 'd' but for a struct file
+ * - 'C' For a clock, it prints the name (Common Clock Framework) or address
+ * (legacy clock framework) of the clock
+ * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
+ * (legacy clock framework) of the clock
+ * - 'Cr' For a clock, it prints the current rate of the clock
+ * - 'T' task_struct->comm
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -1415,7 +1474,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
{
int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
- if (!ptr && *fmt != 'K') {
+ if (!ptr && *fmt != 'K' && *fmt != 'T') {
/*
* Print (null) with the same width as a pointer so it makes
* tabular output look nice.
@@ -1548,10 +1607,14 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return address_val(buf, end, ptr, spec, fmt);
case 'd':
return dentry_name(buf, end, ptr, spec, fmt);
+ case 'C':
+ return clock(buf, end, ptr, spec, fmt);
case 'D':
return dentry_name(buf, end,
((const struct file *)ptr)->f_path.dentry,
spec, fmt);
+ case 'T':
+ return comm_name(buf, end, ptr, spec, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
@@ -1738,29 +1801,21 @@ qualifier:
if (spec->qualifier == 'L')
spec->type = FORMAT_TYPE_LONG_LONG;
else if (spec->qualifier == 'l') {
- if (spec->flags & SIGN)
- spec->type = FORMAT_TYPE_LONG;
- else
- spec->type = FORMAT_TYPE_ULONG;
+ BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
+ spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
} else if (_tolower(spec->qualifier) == 'z') {
spec->type = FORMAT_TYPE_SIZE_T;
} else if (spec->qualifier == 't') {
spec->type = FORMAT_TYPE_PTRDIFF;
} else if (spec->qualifier == 'H') {
- if (spec->flags & SIGN)
- spec->type = FORMAT_TYPE_BYTE;
- else
- spec->type = FORMAT_TYPE_UBYTE;
+ BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
+ spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN);
} else if (spec->qualifier == 'h') {
- if (spec->flags & SIGN)
- spec->type = FORMAT_TYPE_SHORT;
- else
- spec->type = FORMAT_TYPE_USHORT;
+ BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
+ spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
} else {
- if (spec->flags & SIGN)
- spec->type = FORMAT_TYPE_INT;
- else
- spec->type = FORMAT_TYPE_UINT;
+ BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT);
+ spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN);
}
return ++fmt - start;
@@ -1800,6 +1855,11 @@ qualifier:
* %*pE[achnops] print an escaped buffer
* %*ph[CDN] a variable-length hex string with a separator (supports up to 64
* bytes of the input)
+ * %pC output the name (Common Clock Framework) or address (legacy clock
+ * framework) of a clock
+ * %pCn output the name (Common Clock Framework) or address (legacy clock
+ * framework) of a clock
+ * %pCr output the current rate of a clock
* %n is ignored
*
* ** Please update Documentation/printk-formats.txt when making changes **
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..390214da4546 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -517,6 +517,12 @@ config CMA_DEBUG
processing calls such as dma_alloc_from_contiguous().
This option does not affect warning and error messages.
+config CMA_DEBUGFS
+ bool "CMA debugfs interface"
+ depends on CMA && DEBUG_FS
+ help
+ Turns on the DebugFS interface for CMA.
+
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
diff --git a/mm/Makefile b/mm/Makefile
index 15dbe9903c27..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
@@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,7 +19,7 @@
#include <linux/cleancache.h>
/*
- * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
* to the cleancache "backend" implementation functions.
*/
static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +34,107 @@ static u64 cleancache_failed_gets;
static u64 cleancache_puts;
static u64 cleancache_invalidates;
-/*
- * When no backend is registered all calls to init_fs and init_shared_fs
- * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
- * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
- * [shared_|]fs_poolid_map) are given to the respective super block
- * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
- * registers with cleancache the previous calls to init_fs and init_shared_fs
- * are executed to create tmem_pools and set the respective poolids. While no
- * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
- */
-#define MAX_INITIALIZABLE_FS 32
-#define FAKE_FS_POOLID_OFFSET 1000
-#define FAKE_SHARED_FS_POOLID_OFFSET 2000
-
-#define FS_NO_BACKEND (-1)
-#define FS_UNKNOWN (-2)
-static int fs_poolid_map[MAX_INITIALIZABLE_FS];
-static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
-static char *uuids[MAX_INITIALIZABLE_FS];
-/*
- * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
- * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
- * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
- */
-static DEFINE_MUTEX(poolid_mutex);
-/*
- * When set to false (default) all calls to the cleancache functions, except
- * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
- * by the if (!cleancache_ops) return. This means multiple threads (from
- * different filesystems) will be checking cleancache_ops. The usage of a
- * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
- * OK if the time between the backend's have been initialized (and
- * cleancache_ops has been set to not NULL) and when the filesystems start
- * actually calling the backends. The inverse (when unloading) is obviously
- * not good - but this shim does not do that (yet).
- */
-
-/*
- * The backends and filesystems work all asynchronously. This is b/c the
- * backends can be built as modules.
- * The usual sequence of events is:
- * a) mount / -> __cleancache_init_fs is called. We set the
- * [shared_|]fs_poolid_map and uuids for.
- *
- * b). user does I/Os -> we call the rest of __cleancache_* functions
- * which return immediately as cleancache_ops is false.
- *
- * c). modprobe zcache -> cleancache_register_ops. We init the backend
- * and set cleancache_ops to true, and for any fs_poolid_map
- * (which is set by __cleancache_init_fs) we initialize the poolid.
- *
- * d). user does I/Os -> now that cleancache_ops is true all the
- * __cleancache_* functions can call the backend. They all check
- * that fs_poolid_map is valid and if so invoke the backend.
- *
- * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
- * reset (which is the second check in the __cleancache_* ops
- * to call the backend).
- *
- * The sequence of event could also be c), followed by a), and d). and e). The
- * c) would not happen anymore. There is also the chance of c), and one thread
- * doing a) + d), and another doing e). For that case we depend on the
- * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
- * that it handles all I/Os before it invalidates the fs (which is last part
- * of unmounting process).
- *
- * Note: The acute reader will notice that there is no "rmmod zcache" case.
- * This is b/c the functionality for that is not yet implemented and when
- * done, will require some extra locking not yet devised.
- */
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+ switch (sb->cleancache_poolid) {
+ case CLEANCACHE_NO_BACKEND:
+ __cleancache_init_fs(sb);
+ break;
+ case CLEANCACHE_NO_BACKEND_SHARED:
+ __cleancache_init_shared_fs(sb);
+ break;
+ }
+}
/*
- * Register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for cleancache. Returns 0 on success.
*/
-struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(struct cleancache_ops *ops)
{
- struct cleancache_ops *old = cleancache_ops;
- int i;
+ if (cmpxchg(&cleancache_ops, NULL, ops))
+ return -EBUSY;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_NO_BACKEND)
- fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
- if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
- shared_fs_poolid_map[i] = ops->init_shared_fs
- (uuids[i], PAGE_SIZE);
- }
/*
- * We MUST set cleancache_ops _after_ we have called the backends
- * init_fs or init_shared_fs functions. Otherwise the compiler might
- * re-order where cleancache_ops is set in this function.
+ * A cleancache backend can be built as a module and hence loaded after
+ * a cleancache enabled filesystem has called cleancache_init_fs. To
+ * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+ * for each active super block. To differentiate between local and
+ * shared filesystems, we temporarily initialize sb->cleancache_poolid
+ * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+ * respectively in case there is no backend registered at the time
+ * cleancache_init_fs or cleancache_init_shared_fs is called.
+ *
+ * Since filesystems can be mounted concurrently with cleancache
+ * backend registration, we have to be careful to guarantee that all
+ * cleancache enabled filesystems that has been mounted by the time
+ * cleancache_register_ops is called has got and all mounted later will
+ * get cleancache_poolid. This is assured by the following statements
+ * tied together:
+ *
+ * a) iterate_supers skips only those super blocks that has started
+ * ->kill_sb
+ *
+ * b) if iterate_supers encounters a super block that has not finished
+ * ->mount yet, it waits until it is finished
+ *
+ * c) cleancache_init_fs is called from ->mount and
+ * cleancache_invalidate_fs is called from ->kill_sb
+ *
+ * d) we call iterate_supers after cleancache_ops has been set
+ *
+ * From a) it follows that if iterate_supers skips a super block, then
+ * either the super block is already dead, in which case we do not need
+ * to bother initializing cleancache for it, or it was mounted after we
+ * initiated iterate_supers. In the latter case, it must have seen
+ * cleancache_ops set according to d) and initialized cleancache from
+ * ->mount by itself according to c). This proves that we call
+ * ->init_fs at least once for each active super block.
+ *
+ * From b) and c) it follows that if iterate_supers encounters a super
+ * block that has already started ->init_fs, it will wait until ->mount
+ * and hence ->init_fs has finished, then check cleancache_poolid, see
+ * that it has already been set and therefore do nothing. This proves
+ * that we call ->init_fs no more than once for each super block.
+ *
+ * Combined together, the last two paragraphs prove the function
+ * correctness.
+ *
+ * Note that various cleancache callbacks may proceed before this
+ * function is called or even concurrently with it, but since
+ * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+ * until the corresponding ->init_fs has been actually called and
+ * cleancache_ops has been set.
*/
- barrier();
- cleancache_ops = ops;
- mutex_unlock(&poolid_mutex);
- return old;
+ iterate_supers(cleancache_register_ops_sb, NULL);
+ return 0;
}
EXPORT_SYMBOL(cleancache_register_ops);
/* Called by a cleancache-enabled filesystem at time of mount */
void __cleancache_init_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
- if (cleancache_ops)
- fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
- else
- fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_fs);
/* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+void __cleancache_init_shared_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
- uuids[i] = uuid;
- if (cleancache_ops)
- shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
- (uuid, PAGE_SIZE);
- else
- shared_fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_shared_fs);
@@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
}
/*
- * Returns a pool_id that is associated with a given fake poolid.
- */
-static int get_poolid_from_fake(int fake_pool_id)
-{
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
- return shared_fs_poolid_map[fake_pool_id -
- FAKE_SHARED_FS_POOLID_OFFSET];
- else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
- return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
- return FS_NO_BACKEND;
-}
-
-/*
* "Get" data from cleancache associated with the poolid/inode/index
* that were specified when the data was put to cleanache and, if
* successful, use it to fill the specified page with data and return 0.
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page)
{
int ret = -1;
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
goto out;
- pool_id = get_poolid_from_fake(fake_pool_id);
if (cleancache_get_key(page->mapping->host, &key) < 0)
goto out;
- if (pool_id >= 0)
- ret = cleancache_ops->get_page(pool_id,
- key, page->index, page);
+ ret = cleancache_ops->get_page(pool_id, key, page->index, page);
if (ret == 0)
cleancache_succ_gets++;
else
@@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
void __cleancache_put_page(struct page *page)
{
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (pool_id >= 0 &&
cleancache_get_key(page->mapping->host, &key) >= 0) {
cleancache_ops->put_page(pool_id, key, page->index, page);
@@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
struct page *page)
{
/* careful... page->mapping is NULL sometimes when this is called */
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id >= 0) {
- pool_id = get_poolid_from_fake(fake_pool_id);
- if (pool_id < 0)
- return;
-
+ if (pool_id >= 0) {
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (cleancache_get_key(mapping->host, &key) >= 0) {
cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
*/
void __cleancache_invalidate_inode(struct address_space *mapping)
{
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
cleancache_ops->invalidate_inode(pool_id, key);
}
@@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
*/
void __cleancache_invalidate_fs(struct super_block *sb)
{
- int index;
- int fake_pool_id = sb->cleancache_poolid;
- int old_poolid = fake_pool_id;
+ int pool_id;
- mutex_lock(&poolid_mutex);
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
- old_poolid = shared_fs_poolid_map[index];
- shared_fs_poolid_map[index] = FS_UNKNOWN;
- uuids[index] = NULL;
- } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
- old_poolid = fs_poolid_map[index];
- fs_poolid_map[index] = FS_UNKNOWN;
- }
- sb->cleancache_poolid = -1;
- if (cleancache_ops)
- cleancache_ops->invalidate_fs(old_poolid);
- mutex_unlock(&poolid_mutex);
+ pool_id = sb->cleancache_poolid;
+ sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+ if (cleancache_ops && pool_id >= 0)
+ cleancache_ops->invalidate_fs(pool_id);
}
EXPORT_SYMBOL(__cleancache_invalidate_fs);
static int __init init_cleancache(void)
{
- int i;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *root = debugfs_create_dir("cleancache", NULL);
if (root == NULL)
@@ -400,10 +314,6 @@ static int __init init_cleancache(void)
debugfs_create_u64("invalidates", S_IRUGO,
root, &cleancache_invalidates);
#endif
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- fs_poolid_map[i] = FS_UNKNOWN;
- shared_fs_poolid_map[i] = FS_UNKNOWN;
- }
return 0;
}
module_init(init_cleancache)
diff --git a/mm/cma.c b/mm/cma.c
index 68ecb7a42983..47203faaf65e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,29 +35,24 @@
#include <linux/highmem.h>
#include <linux/io.h>
-struct cma {
- unsigned long base_pfn;
- unsigned long count;
- unsigned long *bitmap;
- unsigned int order_per_bit; /* Order of pages represented by one bit */
- struct mutex lock;
-};
-
-static struct cma cma_areas[MAX_CMA_AREAS];
-static unsigned cma_area_count;
+#include "cma.h"
+
+struct cma cma_areas[MAX_CMA_AREAS];
+unsigned cma_area_count;
static DEFINE_MUTEX(cma_mutex);
-phys_addr_t cma_get_base(struct cma *cma)
+phys_addr_t cma_get_base(const struct cma *cma)
{
return PFN_PHYS(cma->base_pfn);
}
-unsigned long cma_get_size(struct cma *cma)
+unsigned long cma_get_size(const struct cma *cma)
{
return cma->count << PAGE_SHIFT;
}
-static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -68,7 +63,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
* Find a PFN aligned to the specified order and return an offset represented in
* order_per_bits.
*/
-static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -77,18 +73,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
- cma->base_pfn) >> cma->order_per_bit;
}
-static unsigned long cma_bitmap_maxno(struct cma *cma)
-{
- return cma->count >> cma->order_per_bit;
-}
-
-static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
- unsigned long pages)
+static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
+ unsigned long pages)
{
return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
}
-static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
+static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
+ unsigned int count)
{
unsigned long bitmap_no, bitmap_count;
@@ -134,6 +126,12 @@ static int __init cma_activate_area(struct cma *cma)
} while (--i);
mutex_init(&cma->lock);
+
+#ifdef CONFIG_CMA_DEBUGFS
+ INIT_HLIST_HEAD(&cma->mem_head);
+ spin_lock_init(&cma->mem_head_lock);
+#endif
+
return 0;
err:
@@ -167,7 +165,8 @@ core_initcall(cma_init_reserved_areas);
* This function creates custom contiguous area from already reserved memory.
*/
int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
- int order_per_bit, struct cma **res_cma)
+ unsigned int order_per_bit,
+ struct cma **res_cma)
{
struct cma *cma;
phys_addr_t alignment;
@@ -358,7 +357,7 @@ err:
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
{
unsigned long mask, offset, pfn, start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -429,7 +428,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
-bool cma_release(struct cma *cma, struct page *pages, int count)
+bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
{
unsigned long pfn;
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,24 @@
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+ unsigned int order_per_bit; /* Order of pages represented by one bit */
+ struct mutex lock;
+#ifdef CONFIG_CMA_DEBUGFS
+ struct hlist_head mem_head;
+ spinlock_t mem_head_lock;
+#endif
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+ return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..ec915e6f17ee
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,171 @@
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
+ */
+
+
+#include <linux/debugfs.h>
+#include <linux/cma.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+
+#include "cma.h"
+
+struct cma_mem {
+ struct hlist_node node;
+ struct page *p;
+ unsigned long n;
+};
+
+static struct dentry *cma_debugfs_root;
+
+static int cma_debugfs_get(void *data, u64 *val)
+{
+ unsigned long *p = data;
+
+ *val = *p;
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+ spin_lock(&cma->mem_head_lock);
+ hlist_add_head(&mem->node, &cma->mem_head);
+ spin_unlock(&cma->mem_head_lock);
+}
+
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+ struct cma_mem *mem = NULL;
+
+ spin_lock(&cma->mem_head_lock);
+ if (!hlist_empty(&cma->mem_head)) {
+ mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+ hlist_del_init(&mem->node);
+ }
+ spin_unlock(&cma->mem_head_lock);
+
+ return mem;
+}
+
+static int cma_free_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem = NULL;
+
+ while (count) {
+ mem = cma_get_entry_from_list(cma);
+ if (mem == NULL)
+ return 0;
+
+ if (mem->n <= count) {
+ cma_release(cma, mem->p, mem->n);
+ count -= mem->n;
+ kfree(mem);
+ } else if (cma->order_per_bit == 0) {
+ cma_release(cma, mem->p, count);
+ mem->p += count;
+ mem->n -= count;
+ count = 0;
+ cma_add_to_cma_mem_list(cma, mem);
+ } else {
+ pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+ cma_add_to_cma_mem_list(cma, mem);
+ break;
+ }
+ }
+
+ return 0;
+
+}
+
+static int cma_free_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_free_mem(cma, pages);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem;
+ struct page *p;
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ p = cma_alloc(cma, count, 0);
+ if (!p) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ mem->p = p;
+ mem->n = count;
+
+ cma_add_to_cma_mem_list(cma, mem);
+
+ return 0;
+}
+
+static int cma_alloc_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_alloc_mem(cma, pages);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+static void cma_debugfs_add_one(struct cma *cma, int idx)
+{
+ struct dentry *tmp;
+ char name[16];
+ int u32s;
+
+ sprintf(name, "cma-%d", idx);
+
+ tmp = debugfs_create_dir(name, cma_debugfs_root);
+
+ debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
+ &cma_alloc_fops);
+
+ debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
+ &cma_free_fops);
+
+ debugfs_create_file("base_pfn", S_IRUGO, tmp,
+ &cma->base_pfn, &cma_debugfs_fops);
+ debugfs_create_file("count", S_IRUGO, tmp,
+ &cma->count, &cma_debugfs_fops);
+ debugfs_create_file("order_per_bit", S_IRUGO, tmp,
+ &cma->order_per_bit, &cma_debugfs_fops);
+
+ u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+}
+
+static int __init cma_debugfs_init(void)
+{
+ int i;
+
+ cma_debugfs_root = debugfs_create_dir("cma", NULL);
+ if (!cma_debugfs_root)
+ return -ENOMEM;
+
+ for (i = 0; i < cma_area_count; i++)
+ cma_debugfs_add_one(&cma_areas[i], i);
+
+ return 0;
+}
+late_initcall(cma_debugfs_init);
+
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..e6c4f9475d43 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1047,6 +1047,12 @@ typedef enum {
} isolate_migrate_t;
/*
+ * Allow userspace to control policy on scanning the unevictable LRU for
+ * compactable pages.
+ */
+int sysctl_compact_unevictable_allowed __read_mostly = 1;
+
+/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
@@ -1057,6 +1063,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long low_pfn, end_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
+ (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
/*
@@ -1174,13 +1181,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
+ bool can_steal;
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
- /* Job done if allocation would set block type */
- if (order >= pageblock_order && area->nr_free)
+#ifdef CONFIG_CMA
+ /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
+ if (migratetype == MIGRATE_MOVABLE &&
+ !list_empty(&area->free_list[MIGRATE_CMA]))
+ return COMPACT_PARTIAL;
+#endif
+ /*
+ * Job done if allocation would steal freepages from
+ * other migratetype buddy lists.
+ */
+ if (find_suitable_fallback(area, order, migratetype,
+ true, &can_steal) != -1)
return COMPACT_PARTIAL;
}
@@ -1587,6 +1605,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
+ /*
+ * When called via /proc/sys/vm/compact_memory
+ * this makes sure we compact the whole zone regardless of
+ * cached scanner positions.
+ */
+ if (cc->order == -1)
+ __reset_isolation_suitable(zone);
+
if (cc->order == -1 || !compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 876f4e6f3ed6..467768d4263b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -202,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
BUG_ON(page_mapped(page));
/*
- * Some filesystems seem to re-dirty the page even after
- * the VM has canceled the dirty bit (eg ext3 journaling).
+ * At this point page must be either written or cleaned by truncate.
+ * Dirty page here signals a bug and loss of unwritten data.
*
- * Fix it up by doing a final dirty accounting check after
- * having removed the page entirely.
+ * This fixes dirty accounting after removing the page entirely but
+ * leaves PageDirty set: it has no effect for truncated page and
+ * anyway will be cleared before returning page into buddy allocator.
*/
- if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
- }
+ if (WARN_ON_ONCE(PageDirty(page)))
+ account_page_cleaned(page, mapping);
}
/**
@@ -616,11 +615,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -743,6 +742,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -807,18 +807,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
diff --git a/mm/gup.c b/mm/gup.c
index a6e24e246f86..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned int fault_flags = 0;
int ret;
- /* For mlock, just skip the stack guard page. */
- if ((*flags & FOLL_MLOCK) &&
+ /* For mm_populate(), just skip the stack guard page. */
+ if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
stack_guard_page_end(vma, address + PAGE_SIZE)))
return -ENOENT;
@@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
EXPORT_SYMBOL(get_user_pages);
/**
+ * populate_vma_page_range() - populate a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @nonblocking:
+ *
+ * This takes care of mlocking the pages too if VM_LOCKED is set.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * If @nonblocking is NULL, it may be held for read or write and will
+ * be unperturbed.
+ *
+ * If @nonblocking is non-NULL, it must held for read only and may be
+ * released. If it's released, *@nonblocking will be set to 0.
+ */
+long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
+
+ /*
+ * We made sure addr is within a VMA, so the following will
+ * not result in a stack expansion that recurses back here.
+ */
+ return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
+}
+
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ long ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. populate_vma_page_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = populate_vma_page_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
+/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
@@ -901,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
*
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
- pte_t pte = ACCESS_ONCE(*ptep);
+ pte_t pte = READ_ONCE(*ptep);
struct page *page;
/*
@@ -1191,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
- pgd_t pgd = ACCESS_ONCE(*pgdp);
+ pgd_t pgd = READ_ONCE(*pgdp);
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6817b0350c71..3f1504322187 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -183,7 +183,7 @@ static struct page *get_huge_zero_page(void)
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return ACCESS_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_page);
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
@@ -202,7 +202,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return ACCESS_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_page);
}
static void put_huge_zero_page(void)
@@ -708,7 +708,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
- struct page *page)
+ struct page *page, gfp_t gfp)
{
struct mem_cgroup *memcg;
pgtable_t pgtable;
@@ -716,7 +716,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
return VM_FAULT_OOM;
pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +822,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ gfp_t huge_gfp; /* for allocation and charge */
ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1107,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- gfp_t gfp;
-
- gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
- new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -1130,8 +1129,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- GFP_TRANSHUGE, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
put_page(new_page);
if (page) {
split_huge_page(page);
@@ -1231,7 +1229,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
@@ -1377,6 +1375,36 @@ out:
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+
+{
+ spinlock_t *ptl;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 1;
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ struct page *page;
+ pmd_t orig_pmd;
+
+ orig_pmd = pmdp_get_and_clear(mm, addr, pmd);
+
+ /* No hugepage in swapcache */
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ spin_unlock(ptl);
+ ret = 0;
+ }
+
+ return ret;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1592,6 +1620,11 @@ unlock:
return NULL;
}
+int pmd_freeable(pmd_t pmd)
+{
+ return !pmd_dirty(pmd);
+}
+
static int __split_huge_page_splitting(struct page *page,
struct vm_area_struct *vma,
unsigned long address)
@@ -1703,7 +1736,7 @@ static void __split_huge_page_refcount(struct page *page,
*/
page_tail->_mapcount = page->_mapcount;
- BUG_ON(page_tail->mapping);
+ BUG_ON(page_tail->mapping != TAIL_MAPPING);
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
@@ -2109,7 +2142,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
while (--_pte >= pte) {
pte_t pteval = *_pte;
- if (!pte_none(pteval))
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
release_pte_page(pte_page(pteval));
}
}
@@ -2120,13 +2153,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
struct page *page;
pte_t *_pte;
- int none = 0;
+ int none_or_zero = 0;
bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2207,9 +2240,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
pte_t pteval = *_pte;
struct page *src_page;
- if (pte_none(pteval)) {
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ if (is_zero_pfn(pte_pfn(pteval))) {
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ }
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
@@ -2311,13 +2356,17 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
return true;
}
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
int node)
{
VM_BUG_ON_PAGE(*hpage, *hpage);
+ /* Only allocate from the target node */
+ *gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+ __GFP_THISNODE;
+
/*
* Before allocating the hugepage, release the mmap_sem read lock.
* The allocation can take potentially a long time if it involves
@@ -2326,8 +2375,7 @@ static struct page
*/
up_read(&mm->mmap_sem);
- *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
- khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ *hpage = alloc_pages_exact_node(node, *gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2380,13 +2428,19 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
return true;
}
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
int node)
{
up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
+
+ /*
+ * khugepaged_alloc_hugepage is doing the preallocation, use the same
+ * gfp flags here.
+ */
+ *gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), 0);
return *hpage;
}
#endif
@@ -2421,16 +2475,17 @@ static void collapse_huge_page(struct mm_struct *mm,
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* release the mmap_sem read lock. */
- new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+ new_page = khugepaged_alloc_page(hpage, &gfp, mm, vma, address, node);
if (!new_page)
return;
if (unlikely(mem_cgroup_try_charge(new_page, mm,
- GFP_TRANSHUGE, &memcg)))
+ gfp, &memcg)))
return;
/*
@@ -2543,7 +2598,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none = 0;
+ int ret = 0, none_or_zero = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
@@ -2561,8 +2616,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c41b2a0ee273..011ce317ae73 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, free the subpool the subpool remain */
- if (free)
+ * remain, give up any reservations mased on minimum size and
+ * free the subpool */
+ if (free) {
+ if (spool->min_hpages != -1)
+ hugetlb_acct_memory(spool->hstate,
+ -spool->min_hpages);
kfree(spool);
+ }
}
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+ long min_hpages)
{
struct hugepage_subpool *spool;
- spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ spool = kzalloc(sizeof(*spool), GFP_KERNEL);
if (!spool)
return NULL;
spin_lock_init(&spool->lock);
spool->count = 1;
- spool->max_hpages = nr_blocks;
- spool->used_hpages = 0;
+ spool->max_hpages = max_hpages;
+ spool->hstate = h;
+ spool->min_hpages = min_hpages;
+
+ if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+ kfree(spool);
+ return NULL;
+ }
+ spool->rsv_hpages = min_hpages;
return spool;
}
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
unlock_or_release_subpool(spool);
}
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * the request. Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward). The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be manitained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
{
- int ret = 0;
+ long ret = delta;
if (!spool)
- return 0;
+ return ret;
spin_lock(&spool->lock);
- if ((spool->used_hpages + delta) <= spool->max_hpages) {
- spool->used_hpages += delta;
- } else {
- ret = -ENOMEM;
+
+ if (spool->max_hpages != -1) { /* maximum size accounting */
+ if ((spool->used_hpages + delta) <= spool->max_hpages)
+ spool->used_hpages += delta;
+ else {
+ ret = -ENOMEM;
+ goto unlock_ret;
+ }
}
- spin_unlock(&spool->lock);
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (delta > spool->rsv_hpages) {
+ /*
+ * Asking for more reserves than those already taken on
+ * behalf of subpool. Return difference.
+ */
+ ret = delta - spool->rsv_hpages;
+ spool->rsv_hpages = 0;
+ } else {
+ ret = 0; /* reserves already accounted for */
+ spool->rsv_hpages -= delta;
+ }
+ }
+
+unlock_ret:
+ spin_unlock(&spool->lock);
return ret;
}
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
+ long ret = delta;
+
if (!spool)
- return;
+ return delta;
spin_lock(&spool->lock);
- spool->used_hpages -= delta;
- /* If hugetlbfs_put_super couldn't free spool due to
- * an outstanding quota reference, free it now. */
+
+ if (spool->max_hpages != -1) /* maximum size accounting */
+ spool->used_hpages -= delta;
+
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (spool->rsv_hpages + delta <= spool->min_hpages)
+ ret = 0;
+ else
+ ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+ spool->rsv_hpages += delta;
+ if (spool->rsv_hpages > spool->min_hpages)
+ spool->rsv_hpages = spool->min_hpages;
+ }
+
+ /*
+ * If hugetlbfs_put_super couldn't free spool due to an outstanding
+ * quota reference, free it now.
+ */
unlock_or_release_subpool(spool);
+
+ return ret;
}
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -874,6 +943,14 @@ void free_huge_page(struct page *page)
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
+ /*
+ * A return code of zero implies that the subpool will be under its
+ * minimum size if the reservation is not restored after page is free.
+ * Therefore, force restore_reserve operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+
spin_lock(&hugetlb_lock);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
@@ -891,7 +968,6 @@ void free_huge_page(struct page *page)
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1462,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
if (chg < 0)
return ERR_PTR(-ENOMEM);
if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1))
+ if (hugepage_subpool_get_pages(spool, 1) < 0)
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2454,6 +2530,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
+ long gbl_reserve;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -2466,8 +2543,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
kref_put(&resv->refs, resv_map_release);
if (reserve) {
- hugetlb_acct_memory(h, -reserve);
- hugepage_subpool_put_pages(spool, reserve);
+ /*
+ * Decrement reserve counts. The global reserve count may be
+ * adjusted if the subpool has a minimum size.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+ hugetlb_acct_memory(h, -gbl_reserve);
}
}
@@ -3278,6 +3359,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
/*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current))) {
+ remainder = 0;
+ break;
+ }
+
+ /*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
@@ -3438,6 +3528,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
+ long gbl_reserve;
/*
* Only apply hugepage reservation if asked. At fault time, an
@@ -3474,8 +3565,13 @@ int hugetlb_reserve_pages(struct inode *inode,
goto out_err;
}
- /* There must be enough pages in the subpool for the mapping */
- if (hugepage_subpool_get_pages(spool, chg)) {
+ /*
+ * There must be enough pages in the subpool for the mapping. If
+ * the subpool has a minimum size, there may be some global
+ * reservations already in place (gbl_reserve).
+ */
+ gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+ if (gbl_reserve < 0) {
ret = -ENOSPC;
goto out_err;
}
@@ -3484,9 +3580,10 @@ int hugetlb_reserve_pages(struct inode *inode,
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, chg);
+ ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- hugepage_subpool_put_pages(spool, chg);
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
goto out_err;
}
@@ -3516,6 +3613,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
+ long gbl_reserve;
if (resv_map)
chg = region_truncate(resv_map, offset);
@@ -3523,8 +3621,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugepage_subpool_put_pages(spool, (chg - freed));
- hugetlb_acct_memory(h, -(chg - freed));
+ /*
+ * If the subpool has a minimum size, the number of global
+ * reservations to be released may be adjusted.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+ hugetlb_acct_memory(h, -gbl_reserve);
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
diff --git a/mm/internal.h b/mm/internal.h
index a96da5b0029d..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal);
#endif
@@ -222,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
- * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * READ_ONCE is used so that if the caller assigns the result into a local
* variable and e.g. tests it for valid range before using, the compiler cannot
* decide to remove the variable and inline the page_private(page) multiple
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
-#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
+#define page_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
@@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
#ifdef CONFIG_MMU
-extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce2eb44..bc7be0ee2080 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
expected_mapping = (void *)stable_node +
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
again:
- kpfn = ACCESS_ONCE(stable_node->kpfn);
+ kpfn = READ_ONCE(stable_node->kpfn);
page = pfn_to_page(kpfn);
/*
@@ -551,7 +551,7 @@ again:
* but on Alpha we need to be more careful.
*/
smp_read_barrier_depends();
- if (ACCESS_ONCE(page->mapping) != expected_mapping)
+ if (READ_ONCE(page->mapping) != expected_mapping)
goto stale;
/*
@@ -577,14 +577,14 @@ again:
cpu_relax();
}
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ if (READ_ONCE(page->mapping) != expected_mapping) {
put_page(page);
goto stale;
}
if (lock_it) {
lock_page(page);
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
goto stale;
@@ -600,7 +600,7 @@ stale:
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
- if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+ if (READ_ONCE(stable_node->kpfn) != kpfn)
goto again;
remove_node_from_stable_tree(stable_node);
return NULL;
@@ -1884,7 +1884,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
diff --git a/mm/madvise.c b/mm/madvise.c
index d551475517bf..22e8f0ca7040 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,14 @@
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
+
+struct madvise_free_private {
+ struct vm_area_struct *vma;
+ struct mmu_gather *tlb;
+};
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -31,6 +39,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -254,6 +263,164 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct madvise_free_private *fp = walk->private;
+ struct mmu_gather *tlb = fp->tlb;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = fp->vma;
+ spinlock_t *ptl;
+ pte_t *pte, ptent;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned long next;
+ int nr_swap = 0;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma, addr, pmd);
+ else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+ goto next;
+ /* fall through */
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+ * If the pte has swp_entry, just clear page table to
+ * prevent swap-in which is more expensive rather than
+ * (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageSwapCache(page)) {
+ if (!trylock_page(page))
+ continue;
+
+ if (!try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ /*
+ * Some of architecture(ex, PPC) don't update TLB
+ * with set_pte_at and tlb_remove_tlb_entry so for
+ * the portability, remap the pte with old|clean
+ * after pte clearing.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ if (PageActive(page))
+ deactivate_page(page);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+next:
+ cond_resched();
+ return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_free_private fp = {
+ .vma = vma,
+ .tlb = tlb,
+ };
+
+ struct mm_walk free_walk = {
+ .pmd_entry = madvise_free_pte_range,
+ .mm = vma->vm_mm,
+ .private = &fp,
+ };
+
+ BUG_ON(addr >= end);
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &free_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ /* MADV_FREE works for only anon vma at the moment */
+ if (vma->vm_file)
+ return -EINVAL;
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return -EINVAL;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ madvise_free_page_range(&tlb, vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+
+ return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ return madvise_free_single_vma(vma, start, end);
+}
+
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -377,6 +544,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_FREE:
+ /*
+ * XXX: In this implementation, MADV_FREE works like
+ * MADV_DONTNEED on swapless system or full swap.
+ */
+ if (get_nr_swap_pages() > 0)
+ return madvise_free(vma, prev, start, end);
+ /* passthrough */
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
@@ -396,6 +571,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/memblock.c b/mm/memblock.c
index 252b77bdf65e..3f37a0bca5d5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -699,14 +699,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *_rgn = &memblock.reserved;
+ struct memblock_type *type = &memblock.reserved;
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(_rgn, base, size, nid, flags);
+ return memblock_add_range(type, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b34ef4a32a3b..14c2f2017e37 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
+ * Native page reclaim
+ * Charge lifetime sanitation
+ * Lockless page tracking & accounting
+ * Unified hierarchy configuration model
+ * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -253,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
- *
- * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark. May be even add a low water mark, such that
- * no reclaim occurs from a cgroup at it's low water mark, this is
- * a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
@@ -454,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
return memcg->css.id;
}
+/*
+ * A helper function to get mem_cgroup from ID. must be called under
+ * rcu_read_lock(). The caller is responsible for calling
+ * css_tryget_online() if the mem_cgroup is used for charging. (dropping
+ * refcnt from swap can be called against removed memcg.)
+ */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
@@ -667,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
unsigned long nr_pages = page_counter_read(&memcg->memory);
- unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
+ unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;
if (nr_pages > soft_limit)
@@ -1035,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
goto out_unlock;
do {
- pos = ACCESS_ONCE(iter->position);
+ pos = READ_ONCE(iter->position);
/*
* A racing update may change the position and
* put the last reference, hence css_tryget(),
@@ -1352,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;
count = page_counter_read(&memcg->memory);
- limit = ACCESS_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.limit);
if (count < limit)
margin = limit - count;
if (do_swap_account) {
count = page_counter_read(&memcg->memsw);
- limit = ACCESS_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.limit);
if (count <= limit)
margin = min(margin, limit - count);
}
@@ -1436,15 +1443,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
struct mem_cgroup *iter;
unsigned int i;
- if (!p)
- return;
-
mutex_lock(&oom_info_lock);
rcu_read_lock();
- pr_info("Task in ");
- pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
+ if (p) {
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_cont(" killed as a result of limit of ");
+ } else {
+ pr_info("Memory limit reached of cgroup ");
+ }
+
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont("\n");
@@ -1531,7 +1540,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
return;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -2341,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
}
/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
-{
- /* ID 0 is unused ID */
- if (!id)
- return NULL;
- return mem_cgroup_from_id(id);
-}
-
-/*
* try_get_mem_cgroup_from_page - look up page's memcg association
* @page: the page
*
@@ -2380,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg && !css_tryget_online(&memcg->css))
memcg = NULL;
rcu_read_unlock();
@@ -2642,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2779,92 +2774,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
- * @nr_pages: number of regular pages (>1 for huge pages)
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
- *
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
- struct mem_cgroup *from,
- struct mem_cgroup *to)
-{
- unsigned long flags;
- int ret;
-
- VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
-
- /*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
- */
- if (!trylock_page(page))
- goto out;
-
- ret = -EINVAL;
- if (page->mem_cgroup != from)
- goto out_unlock;
-
- spin_lock_irqsave(&from->move_lock, flags);
-
- if (!PageAnon(page) && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- }
-
- if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- }
-
- /*
- * It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
- */
-
- /* caller should have done css_get */
- page->mem_cgroup = to;
- spin_unlock_irqrestore(&from->move_lock, flags);
-
- ret = 0;
-
- local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
- memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
- memcg_check_events(from, page);
- local_irq_enable();
-out_unlock:
- unlock_page(page);
-out:
- return ret;
-}
-
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
@@ -4816,6 +4725,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return page;
}
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * - page is not on LRU (isolate_page() is useful.)
+ * - compound_lock is held when nr_pages > 1
+ *
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
+ */
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to)
+{
+ unsigned long flags;
+ int ret;
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ /*
+ * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
+ * of its source page while we change it: page migration takes
+ * both pages off the LRU, but page cache replacement doesn't.
+ */
+ if (!trylock_page(page))
+ goto out;
+
+ ret = -EINVAL;
+ if (page->mem_cgroup != from)
+ goto out_unlock;
+
+ spin_lock_irqsave(&from->move_lock, flags);
+
+ if (!PageAnon(page) && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
+
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
+
+ /*
+ * It is safe to change page->mem_cgroup here because the page
+ * is referenced, charged, and isolated - we can't race with
+ * uncharging, charging, migration, or LRU putback.
+ */
+
+ /* caller should have done css_get */
+ page->mem_cgroup = to;
+ spin_unlock_irqrestore(&from->move_lock, flags);
+
+ ret = 0;
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
+ memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
+ memcg_check_events(from, page);
+ local_irq_enable();
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
@@ -5012,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
- move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+ move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
if (move_flags) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5246,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = ACCESS_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->low);
if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5276,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = ACCESS_ONCE(memcg->high);
+ unsigned long high = READ_ONCE(memcg->high);
if (high == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5306,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = ACCESS_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.limit);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5861,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
id = swap_cgroup_record(entry, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memsw, 1);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..50749989c18d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -521,6 +521,52 @@ static const char *action_name[] = {
[RECOVERED] = "Recovered",
};
+enum page_type {
+ KERNEL,
+ KERNEL_HIGH_ORDER,
+ SLAB,
+ DIFFERENT_COMPOUND,
+ POISONED_HUGE,
+ HUGE,
+ FREE_HUGE,
+ UNMAP_FAILED,
+ DIRTY_SWAPCACHE,
+ CLEAN_SWAPCACHE,
+ DIRTY_MLOCKED_LRU,
+ CLEAN_MLOCKED_LRU,
+ DIRTY_UNEVICTABLE_LRU,
+ CLEAN_UNEVICTABLE_LRU,
+ DIRTY_LRU,
+ CLEAN_LRU,
+ TRUNCATED_LRU,
+ BUDDY,
+ BUDDY_2ND,
+ UNKNOWN,
+};
+
+static const char *action_page_type[] = {
+ [KERNEL] = "reserved kernel page",
+ [KERNEL_HIGH_ORDER] = "high-order kernel page",
+ [SLAB] = "kernel slab page",
+ [DIFFERENT_COMPOUND] = "different compound page after locking",
+ [POISONED_HUGE] = "huge page already hardware poisoned",
+ [HUGE] = "huge page",
+ [FREE_HUGE] = "free huge page",
+ [UNMAP_FAILED] = "unmapping failed page",
+ [DIRTY_SWAPCACHE] = "dirty swapcache page",
+ [CLEAN_SWAPCACHE] = "clean swapcache page",
+ [DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
+ [CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
+ [DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
+ [CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
+ [DIRTY_LRU] = "dirty LRU page",
+ [CLEAN_LRU] = "clean LRU page",
+ [TRUNCATED_LRU] = "already truncated LRU page",
+ [BUDDY] = "free buddy page",
+ [BUDDY_2ND] = "free buddy page (2nd try)",
+ [UNKNOWN] = "unknown page",
+};
+
/*
* XXX: It is possible that a page is isolated from LRU cache,
* and then kept in swap cache or failed to remove from page cache.
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
static struct page_state {
unsigned long mask;
unsigned long res;
- char *msg;
+ int type;
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
- { reserved, reserved, "reserved kernel", me_kernel },
+ { reserved, reserved, KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
* PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +837,31 @@ static struct page_state {
* currently unused objects without touching them. But just
* treat it as standard kernel for now.
*/
- { slab, slab, "kernel slab", me_kernel },
+ { slab, slab, SLAB, me_kernel },
#ifdef CONFIG_PAGEFLAGS_EXTENDED
- { head, head, "huge", me_huge_page },
- { tail, tail, "huge", me_huge_page },
+ { head, head, HUGE, me_huge_page },
+ { tail, tail, HUGE, me_huge_page },
#else
- { compound, compound, "huge", me_huge_page },
+ { compound, compound, HUGE, me_huge_page },
#endif
- { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, DIRTY_SWAPCACHE, me_swapcache_dirty },
+ { sc|dirty, sc, CLEAN_SWAPCACHE, me_swapcache_clean },
- { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
- { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
+ { mlock|dirty, mlock|dirty, DIRTY_MLOCKED_LRU, me_pagecache_dirty },
+ { mlock|dirty, mlock, CLEAN_MLOCKED_LRU, me_pagecache_clean },
- { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
- { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
+ { unevict|dirty, unevict|dirty, DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
+ { unevict|dirty, unevict, CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
- { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
- { lru|dirty, lru, "clean LRU", me_pagecache_clean },
+ { lru|dirty, lru|dirty, DIRTY_LRU, me_pagecache_dirty },
+ { lru|dirty, lru, CLEAN_LRU, me_pagecache_clean },
/*
* Catchall entry: must be at end.
*/
- { 0, 0, "unknown page state", me_unknown },
+ { 0, 0, UNKNOWN, me_unknown },
};
#undef dirty
@@ -835,10 +881,10 @@ static struct page_state {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, char *msg, int result)
+static void action_result(unsigned long pfn, int type, int result)
{
- pr_err("MCE %#lx: %s page recovery: %s\n",
- pfn, msg, action_name[result]);
+ pr_err("MCE %#lx: recovery action for %s: %s\n",
+ pfn, action_page_type[type], action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p,
count--;
if (count != 0) {
printk(KERN_ERR
- "MCE %#lx: %s page still referenced by %d users\n",
- pfn, ps->msg, count);
+ "MCE %#lx: %s still referenced by %d users\n",
+ pfn, action_page_type[ps->type], count);
result = FAILED;
}
- action_result(pfn, ps->msg, result);
+ action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
/*
@@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!(flags & MF_COUNT_INCREASED) &&
!get_page_unless_zero(hpage)) {
if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, BUDDY, DELAYED);
return 0;
} else if (PageHuge(hpage)) {
/*
@@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
- action_result(pfn, "free huge",
+ action_result(pfn, FREE_HUGE,
res ? IGNORED : DELAYED);
unlock_page(hpage);
return res;
} else {
- action_result(pfn, "high order kernel", IGNORED);
+ action_result(pfn, KERNEL_HIGH_ORDER, IGNORED);
return -EBUSY;
}
}
@@ -1136,7 +1182,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
@@ -1150,9 +1196,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (is_free_buddy_page(p)) {
if (flags & MF_COUNT_INCREASED)
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, BUDDY, DELAYED);
else
- action_result(pfn, "free buddy, 2nd try", DELAYED);
+ action_result(pfn, BUDDY_2ND, DELAYED);
return 0;
}
}
@@ -1165,7 +1211,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* If this happens just bail out.
*/
if (compound_head(p) != hpage) {
- action_result(pfn, "different compound page after locking", IGNORED);
+ action_result(pfn, DIFFERENT_COMPOUND, IGNORED);
res = -EBUSY;
goto out;
}
@@ -1205,8 +1251,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* on the head page to show that the hugepage is hwpoisoned
*/
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
- action_result(pfn, "hugepage already hardware poisoned",
- IGNORED);
+ action_result(pfn, POISONED_HUGE, IGNORED);
unlock_page(hpage);
put_page(hpage);
return 0;
@@ -1235,7 +1280,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
!= SWAP_SUCCESS) {
- action_result(pfn, "unmapping failed", IGNORED);
+ action_result(pfn, UNMAP_FAILED, IGNORED);
res = -EBUSY;
goto out;
}
@@ -1244,7 +1289,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, "already truncated LRU", IGNORED);
+ action_result(pfn, TRUNCATED_LRU, IGNORED);
res = -EBUSY;
goto out;
}
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..59f62683908e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1983,167 +1983,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
- *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Handle write page faults for pages that can be reused in the current vma
*
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ struct page *page, int page_mkwrite,
+ int dirty_shared)
__releases(ptl)
{
- struct page *old_page, *new_page = NULL;
pte_t entry;
- int ret = 0;
- int page_mkwrite = 0;
- bool dirty_shared = false;
- unsigned long mmun_start = 0; /* For mmu_notifiers */
- unsigned long mmun_end = 0; /* For mmu_notifiers */
- struct mem_cgroup *memcg;
-
- old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page) {
- /*
- * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
- *
- * We should not cow pages in a shared writeable mapping.
- * Just mark the pages writable as we can't do any dirty
- * accounting on raw pfn maps.
- */
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
- goto reuse;
- goto gotten;
- }
-
/*
- * Take out anonymous pages first, anonymous shared vmas are
- * not dirty accountable.
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
- if (!trylock_page(old_page)) {
- page_cache_get(old_page);
- pte_unmap_unlock(page_table, ptl);
- lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_cache_release(old_page);
- }
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
- unlock_page(old_page);
- goto reuse;
- }
- unlock_page(old_page);
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))) {
- page_cache_get(old_page);
- /*
- * Only catch write-faults on shared writable pages,
- * read-only shared pages can get COWed by
- * get_user_pages(.write=1, .force=1).
- */
- if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- int tmp;
-
- pte_unmap_unlock(page_table, ptl);
- tmp = do_page_mkwrite(vma, old_page, address);
- if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
- return tmp;
- }
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_mkwrite = 1;
- }
-
- dirty_shared = true;
-
-reuse:
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (old_page)
- page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
- flush_cache_page(vma, address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, address, page_table, entry,1))
- update_mmu_cache(vma, address, page_table);
- pte_unmap_unlock(page_table, ptl);
- ret |= VM_FAULT_WRITE;
+ if (page)
+ page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+ update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
- if (!page_mkwrite)
- lock_page(old_page);
+ if (dirty_shared) {
+ struct address_space *mapping;
+ int dirtied;
- dirtied = set_page_dirty(old_page);
- VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
- mapping = old_page->mapping;
- unlock_page(old_page);
- page_cache_release(old_page);
+ if (!page_mkwrite)
+ lock_page(page);
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ mapping = page->mapping;
+ unlock_page(page);
+ page_cache_release(page);
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
}
- return ret;
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
}
- /*
- * Ok, we need to copy. Oh, well..
- */
- page_cache_get(old_page);
-gotten:
- pte_unmap_unlock(page_table, ptl);
+ return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ * relevant references. This includes dropping the reference the page-table
+ * held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ pte_t orig_pte, struct page *old_page)
+{
+ struct page *new_page = NULL;
+ spinlock_t *ptl = NULL;
+ pte_t entry;
+ int page_copied = 0;
+ const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
+ const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
+ struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2163,8 +2087,6 @@ gotten:
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
- mmun_start = address & PAGE_MASK;
- mmun_end = mmun_start + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
@@ -2177,8 +2099,9 @@ gotten:
dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- } else
+ } else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2150,29 @@ gotten:
/* Free the old page.. */
new_page = old_page;
- ret |= VM_FAULT_WRITE;
- } else
+ page_copied = 1;
+ } else {
mem_cgroup_cancel_charge(new_page, memcg);
+ }
if (new_page)
page_cache_release(new_page);
-unlock:
+
pte_unmap_unlock(page_table, ptl);
- if (mmun_end > mmun_start)
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
}
- return ret;
+ return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
page_cache_release(new_page);
oom:
@@ -2258,6 +2181,144 @@ oom:
return VM_FAULT_OOM;
}
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table,
+ pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+ struct page *old_page)
+ __releases(ptl)
+{
+ int page_mkwrite = 0;
+
+ page_cache_get(old_page);
+
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ int tmp;
+
+ pte_unmap_unlock(page_table, ptl);
+ tmp = do_page_mkwrite(vma, old_page, address);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(old_page);
+ return tmp;
+ }
+ /*
+ * Since we dropped the lock we need to revalidate
+ * the PTE as someone else may have changed it. If
+ * they did, we just return, as we can count on the
+ * MMU to tell us if they didn't also make it writable.
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_mkwrite = 1;
+ }
+
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
+{
+ struct page *old_page;
+
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+ * VM_PFNMAP VMA.
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable as we can't do any dirty
+ * accounting on raw pfn maps.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, 0, 0);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+ }
+
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (!trylock_page(old_page)) {
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ lock_page(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_cache_release(old_page);
+ }
+ if (reuse_swap_page(old_page)) {
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, 0, 0);
+ }
+ unlock_page(old_page);
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ return wp_page_shared(mm, vma, address, page_table, pmd,
+ ptl, orig_pte, old_page);
+ }
+
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ page_cache_get(old_page);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+}
+
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
@@ -2784,7 +2845,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
struct vm_fault vmf;
int off;
- nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+ nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
start_addr = max(address & mask, vma->vm_start);
@@ -2972,7 +3033,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 65842d688b7c..533d4eac1775 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -104,7 +104,7 @@ void put_online_mems(void)
}
-static void mem_hotplug_begin(void)
+void mem_hotplug_begin(void)
{
mem_hotplug.active_writer = current;
@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
}
}
-static void mem_hotplug_done(void)
+void mem_hotplug_done(void)
{
mem_hotplug.active_writer = NULL;
mutex_unlock(&mem_hotplug.lock);
@@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
+ err = __add_section(nid, zone, section_nr_to_pfn(i));
/*
* EEXIST is finally dealt with by ioresource collision
@@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
}
+/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int ret;
struct memory_notify arg;
- mem_hotplug_begin();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
@@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
zone = page_zone(pfn_to_page(pfn));
- ret = -EINVAL;
if ((zone_idx(zone) > ZONE_NORMAL ||
online_type == MMOP_ONLINE_MOVABLE) &&
!can_online_high_movable(zone))
- goto out;
+ return -EINVAL;
if (online_type == MMOP_ONLINE_KERNEL &&
zone_idx(zone) == ZONE_MOVABLE) {
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
if (online_type == MMOP_ONLINE_MOVABLE &&
zone_idx(zone) == ZONE_MOVABLE - 1) {
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
/* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
/*
* If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
(((unsigned long long) pfn + nr_pages)
<< PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
zone->present_pages += onlined_pages;
@@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
-out:
- mem_hotplug_done();
- return ret;
+ return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1335,7 +1332,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) is belongs to the same zone.
+ * Confirm all pages in a range [start, end) belong to the same zone.
*/
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1346,10 +1343,11 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
for (pfn = start_pfn;
pfn < end_pfn;
pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
- i++;
+ /* Find the first valid pfn in this pageblock */
+ for (i = 0; i < MAX_ORDER_NR_PAGES; i++) {
+ if (pfn_valid(pfn + i))
+ break;
+ }
if (i == MAX_ORDER_NR_PAGES)
continue;
page = pfn_to_page(pfn + i);
@@ -1688,21 +1686,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- mem_hotplug_begin();
-
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
- ret = -EINVAL;
if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
- goto out;
+ return -EINVAL;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
if (ret)
- goto out;
+ return ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1795,7 +1790,6 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- mem_hotplug_done();
return 0;
failed_removal:
@@ -1805,12 +1799,10 @@ failed_removal:
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
-
-out:
- mem_hotplug_done();
return ret;
}
+/* Must be protected by mem_hotplug_begin() */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4721046a134a..ede26291d4aa 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
else
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE, 0);
}
/*
@@ -1985,7 +1986,8 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(node, *nmask)) {
mpol_cond_put(pol);
- page = alloc_pages_exact_node(node, gfp, order);
+ page = alloc_pages_exact_node(node,
+ gfp | __GFP_THISNODE, order);
goto out;
}
}
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c7203..949970db2874 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node);
* mempool_create().
* @new_min_nr: the new minimum number of elements guaranteed to be
* allocated for this pool.
- * @gfp_mask: the usual allocation bitmask.
*
* This function shrinks/grows the pool. In the case of growing,
* it cannot be guaranteed that the pool will be grown to the new
* size immediately, but new mempool_free() calls will refill it.
+ * This function may sleep.
*
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
*/
-int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
+int mempool_resize(mempool_t *pool, int new_min_nr)
{
void *element;
void **new_elements;
unsigned long flags;
BUG_ON(new_min_nr <= 0);
+ might_sleep();
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
@@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
spin_unlock_irqrestore(&pool->lock, flags);
/* Grow the pool */
- new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
+ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
+ GFP_KERNEL);
if (!new_elements)
return -ENOMEM;
@@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
while (pool->curr_nr < pool->min_nr) {
spin_unlock_irqrestore(&pool->lock, flags);
- element = pool->alloc(gfp_mask, pool->pool_data);
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
if (!element)
goto out;
spin_lock_irqsave(&pool->lock, flags);
diff --git a/arch/x86/mm/memtest.c b/mm/memtest.c
index 1e9da795767a..1997d934b13b 100644
--- a/arch/x86/mm/memtest.c
+++ b/mm/memtest.c
@@ -29,7 +29,7 @@ static u64 patterns[] __initdata = {
0x7a6c7258554e494cULL, /* yeah ;-) */
};
-static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
+static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
{
printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
(unsigned long long) pattern,
@@ -38,11 +38,11 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
memblock_reserve(start_bad, end_bad - start_bad);
}
-static void __init memtest(u64 pattern, u64 start_phys, u64 size)
+static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
{
u64 *p, *start, *end;
- u64 start_bad, last_bad;
- u64 start_phys_aligned;
+ phys_addr_t start_bad, last_bad;
+ phys_addr_t start_phys_aligned;
const size_t incr = sizeof(pattern);
start_phys_aligned = ALIGN(start_phys, incr);
@@ -69,14 +69,14 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size)
reserve_bad_mem(pattern, start_bad, last_bad + incr);
}
-static void __init do_one_pass(u64 pattern, u64 start, u64 end)
+static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
{
u64 i;
phys_addr_t this_start, this_end;
for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
- this_start = clamp_t(phys_addr_t, this_start, start, end);
- this_end = clamp_t(phys_addr_t, this_end, start, end);
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
if (this_start < this_end) {
printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
(unsigned long long)this_start,
@@ -102,7 +102,7 @@ static int __init parse_memtest(char *arg)
early_param("memtest", parse_memtest);
-void __init early_memtest(unsigned long start, unsigned long end)
+void __init early_memtest(phys_addr_t start, phys_addr_t end)
{
unsigned int i;
unsigned int idx = 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 85e042686031..114602a68111 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1554,30 +1554,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
* page migration rate limiting control.
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
* window of time. Default here says do not migrate more than 1280M per second.
- * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
- * as it is faults that reset the window, pte updates will happen unconditionally
- * if there has not been a fault since @pteupdate_interval_millisecs after the
- * throttle window closed.
*/
static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-/* Returns true if NUMA migration is currently rate limited */
-bool migrate_ratelimited(int node)
-{
- pg_data_t *pgdat = NODE_DATA(node);
-
- if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
- msecs_to_jiffies(pteupdate_interval_millisecs)))
- return false;
-
- if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
- return false;
-
- return true;
-}
-
/* Returns true if the node is migrate rate-limited after the update */
static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
unsigned long nr_pages)
@@ -1754,7 +1734,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
diff --git a/mm/mlock.c b/mm/mlock.c
index 8a54cd214925..6fd2cf15e868 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,62 +205,6 @@ out:
return nr_pages - 1;
}
-/**
- * __mlock_vma_pages_range() - mlock a range of pages in the vma.
- * @vma: target vma
- * @start: start address
- * @end: end address
- * @nonblocking:
- *
- * This takes care of making the pages present too.
- *
- * return 0 on success, negative error code on error.
- *
- * vma->vm_mm->mmap_sem must be held.
- *
- * If @nonblocking is NULL, it may be held for read or write and will
- * be unperturbed.
- *
- * If @nonblocking is non-NULL, it must held for read only and may be
- * released. If it's released, *@nonblocking will be set to 0.
- */
-long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long nr_pages = (end - start) / PAGE_SIZE;
- int gup_flags;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(end & ~PAGE_MASK);
- VM_BUG_ON_VMA(start < vma->vm_start, vma);
- VM_BUG_ON_VMA(end > vma->vm_end, vma);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
- gup_flags = FOLL_TOUCH | FOLL_MLOCK;
- /*
- * We want to touch writable mappings with a write fault in order
- * to break COW, except for shared mappings because these don't COW
- * and we would not want to dirty them for nothing.
- */
- if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
- gup_flags |= FOLL_WRITE;
-
- /*
- * We want mlock to succeed for regions that have any permissions
- * other than PROT_NONE.
- */
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
- gup_flags |= FOLL_FORCE;
-
- /*
- * We made sure addr is within a VMA, so the following will
- * not result in a stack expansion that recurses back here.
- */
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking);
-}
-
/*
* convert get_user_pages() return value to posix mlock() error
*/
@@ -596,7 +540,7 @@ success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
- * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
+ * set VM_LOCKED, populate_vma_page_range will bring it back.
*/
if (lock)
@@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-/*
- * __mm_populate - populate and/or mlock pages within a range of address space.
- *
- * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
- * flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
- */
-int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
-{
- struct mm_struct *mm = current->mm;
- unsigned long end, nstart, nend;
- struct vm_area_struct *vma = NULL;
- int locked = 0;
- long ret = 0;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(len != PAGE_ALIGN(len));
- end = start + len;
-
- for (nstart = start; nstart < end; nstart = nend) {
- /*
- * We want to fault in pages for [nstart; end) address range.
- * Find first corresponding VMA.
- */
- if (!locked) {
- locked = 1;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, nstart);
- } else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
- break;
- /*
- * Set [nstart; nend) to intersection of desired address
- * range with the first VMA. Also, skip undesirable VMA types.
- */
- nend = min(end, vma->vm_end);
- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- continue;
- if (nstart < vma->vm_start)
- nstart = vma->vm_start;
- /*
- * Now fault in a range of pages. __mlock_vma_pages_range()
- * double checks the vma flags, so that it won't mlock pages
- * if the vma was already munlocked.
- */
- ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
- if (ret < 0) {
- if (ignore_errors) {
- ret = 0;
- continue; /* continue at next VMA */
- }
- ret = __mlock_posix_error_return(ret);
- break;
- }
- nend = nstart + ret * PAGE_SIZE;
- ret = 0;
- }
- if (locked)
- up_read(&mm->mmap_sem);
- return ret; /* 0 or negative error code */
-}
-
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
- if (!error)
- error = __mm_populate(start, len, 0);
- return error;
+ if (error)
+ return error;
+
+ error = __mm_populate(start, len, 0);
+ if (error)
+ return __mlock_posix_error_return(error);
+ return 0;
}
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9ec50a368634..e65cbe0d64fc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
* by another page fault trying to merge _that_. But that's ok: if it
* is being set up, that automatically means that it will be a singleton
* acceptable for merging, so we can do all of this optimistically. But
- * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ * we do that READ_ONCE() to make sure that we never re-load the pointer.
*
* IOW: that the "list_is_singular()" test on the anon_vma_chain only
* matters for the 'stable anon_vma' case (ie the thing we want to avoid
@@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
if (anon_vma_compatible(a, b)) {
- struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
+ struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
if (anon_vma && list_is_singular(&old->anon_vma_chain))
return anon_vma;
@@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
actual_size = size;
if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
actual_size -= PAGE_SIZE;
- if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
@@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
@@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(vma, addr, start, NULL);
+ populate_vma_page_range(vma, addr, start, NULL);
return vma;
}
#endif
diff --git a/mm/mremap.c b/mm/mremap.c
index 57dadc025c64..afa3ab740d8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -339,25 +339,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
struct vm_area_struct *vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
- goto Efault;
+ return ERR_PTR(-EFAULT);
if (is_vm_hugetlb_page(vma))
- goto Einval;
+ return ERR_PTR(-EINVAL);
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
- goto Efault;
+ return ERR_PTR(-EFAULT);
/* Need to be careful about a growing mapping */
if (new_len > old_len) {
unsigned long pgoff;
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- goto Efault;
+ return ERR_PTR(-EFAULT);
pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- goto Einval;
+ return ERR_PTR(-EINVAL);
}
if (vma->vm_flags & VM_LOCKED) {
@@ -366,29 +366,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
lock_limit = rlimit(RLIMIT_MEMLOCK);
locked += new_len - old_len;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- goto Eagain;
+ return ERR_PTR(-EAGAIN);
}
if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
- goto Enomem;
+ return ERR_PTR(-ENOMEM);
if (vma->vm_flags & VM_ACCOUNT) {
unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
- goto Efault;
+ return ERR_PTR(-ENOMEM);
*p = charged;
}
return vma;
-
-Efault: /* very odd choice for most of the cases, but... */
- return ERR_PTR(-EFAULT);
-Einval:
- return ERR_PTR(-EINVAL);
-Enomem:
- return ERR_PTR(-ENOMEM);
-Eagain:
- return ERR_PTR(-EAGAIN);
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 642f38cb175a..2b665da1b3c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly;
static DECLARE_RWSEM(oom_sem);
/**
- * mark_tsk_oom_victim - marks the given taks as OOM victim.
+ * mark_tsk_oom_victim - marks the given task as OOM victim.
* @tsk: task to mark
*
* Has to be called with oom_sem taken for read and never after
@@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask)
+ int order, const nodemask_t *nodemask,
+ struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, NULL, nodemask);
+ dump_header(NULL, gfp_mask, order, memcg, nodemask);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
&totalpages);
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, nodemask) &&
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 644bcb665773..0372411f38fc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2111,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
+ * Helper function for deaccounting dirty page without writeback.
+ *
+ * Doing this should *normally* only ever be done when a page
+ * is truncated, and is not actually mapped anywhere at all. However,
+ * fs/buffer.c does this when it notices that somebody has cleaned
+ * out all the buffers on a page without actually doing it through
+ * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ */
+void account_page_cleaned(struct page *page, struct address_space *mapping)
+{
+ if (mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+ task_io_account_cancelled_write(PAGE_CACHE_SIZE);
+ }
+}
+EXPORT_SYMBOL(account_page_cleaned);
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0..ed0f1c63065f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -373,6 +373,7 @@ void prep_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
p->first_page = page;
/* Make sure p->first_page is always valid for PageTail() */
smp_wmb();
@@ -765,6 +766,12 @@ static void free_one_page(struct zone *zone,
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ page->mapping = NULL;
+ return 1;
+ }
+ page->mapping = NULL;
if (!IS_ENABLED(CONFIG_DEBUG_VM))
return 0;
if (unlikely(!PageTail(page))) {
@@ -1032,11 +1039,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
- [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
-#else
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1049,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
#endif
};
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order)
+{
+ return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order) { return NULL; }
+#endif
+
/*
* Move the free pages in a range to the free lists of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1152,40 @@ static void change_pageblock_range(struct page *pageblock_page,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
- *
- * If we claim more than half of the pageblock, change pageblock's migratetype
- * as well.
*/
-static void try_to_steal_freepages(struct zone *zone, struct page *page,
- int start_type, int fallback_type)
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+ /*
+ * Leaving this order check is intended, although there is
+ * relaxed order check in next check. The reason is that
+ * we can actually steal whole pageblock if this condition met,
+ * but, below check doesn't guarantee it and that is just heuristic
+ * so could be changed anytime.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ if (order >= pageblock_order / 2 ||
+ start_mt == MIGRATE_RECLAIMABLE ||
+ start_mt == MIGRATE_UNMOVABLE ||
+ page_group_by_mobility_disabled)
+ return true;
+
+ return false;
+}
+
+/*
+ * This function implements actual steal behaviour. If order is large enough,
+ * we can steal whole pageblock. If not, we first move freepages in this
+ * pageblock and check whether half of pages are moved or not. If half of
+ * pages are moved, we can change migratetype of pageblock and permanently
+ * use it's pages as requested migratetype in the future.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+ int start_type)
{
int current_order = page_order(page);
+ int pages;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1151,19 +1193,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
return;
}
- if (current_order >= pageblock_order / 2 ||
- start_type == MIGRATE_RECLAIMABLE ||
- start_type == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled) {
- int pages;
+ pages = move_freepages_block(zone, page, start_type);
+
+ /* Claim the whole block if over half of it is free */
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
+ set_pageblock_migratetype(page, start_type);
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
- pages = move_freepages_block(zone, page, start_type);
+ *can_steal = false;
+ for (i = 0;; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (fallback_mt == MIGRATE_RESERVE)
+ break;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
- set_pageblock_migratetype(page, start_type);
+ if (list_empty(&area->free_list[fallback_mt]))
+ continue;
+
+ if (can_steal_fallback(order, migratetype))
+ *can_steal = true;
+
+ if (!only_stealable)
+ return fallback_mt;
+
+ if (*can_steal)
+ return fallback_mt;
}
+
+ return -1;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1245,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
+ int fallback_mt;
+ bool can_steal;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
- int i;
- for (i = 0;; i++) {
- int migratetype = fallbacks[start_migratetype][i];
- int buddy_type = start_migratetype;
-
- /* MIGRATE_RESERVE handled later if necessary */
- if (migratetype == MIGRATE_RESERVE)
- break;
-
- area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- area->nr_free--;
-
- if (!is_migrate_cma(migratetype)) {
- try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
- } else {
- /*
- * When borrowing from MIGRATE_CMA, we need to
- * release the excess buddy pages to CMA
- * itself, and we do not try to steal extra
- * free pages.
- */
- buddy_type = migratetype;
- }
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt == -1)
+ continue;
- /* Remove the page from the freelists */
- list_del(&page->lru);
- rmv_page_order(page);
+ page = list_entry(area->free_list[fallback_mt].next,
+ struct page, lru);
+ if (can_steal)
+ steal_suitable_fallback(zone, page, start_migratetype);
- expand(zone, page, order, current_order, area,
- buddy_type);
+ /* Remove the page from the freelists */
+ area->nr_free--;
+ list_del(&page->lru);
+ rmv_page_order(page);
- /*
- * The freepage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
- */
- set_freepage_migratetype(page, buddy_type);
+ expand(zone, page, order, current_order, area,
+ start_migratetype);
+ /*
+ * The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
+ */
+ set_freepage_migratetype(page, start_migratetype);
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
- return page;
- }
+ return page;
}
return NULL;
@@ -1249,7 +1302,11 @@ retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (migratetype == MIGRATE_MOVABLE)
+ page = __rmqueue_cma_fallback(zone, order);
+
+ if (!page)
+ page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -1321,7 +1378,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
int to_drain, batch;
local_irq_save(flags);
- batch = ACCESS_ONCE(pcp->batch);
+ batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1577,7 @@ void free_hot_cold_page(struct page *page, bool cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- unsigned long batch = ACCESS_ONCE(pcp->batch);
+ unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
@@ -2362,13 +2419,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*did_some_progress = 1;
goto out;
}
- /*
- * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
- * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
- * The caller should handle page allocation failure by itself if
- * it specifies __GFP_THISNODE.
- * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
- */
+ /* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
}
@@ -2623,15 +2674,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
}
/*
- * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
- * __GFP_NOWARN set) should not cause reclaim since the subsystem
- * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
- * using a larger set of nodes after it has established that the
- * allowed per node queues are empty and that nodes are
- * over allocated.
+ * If this allocation cannot block and it is for a specific node, then
+ * fail early. There's no need to wakeup kswapd or retry for a
+ * speculative node-specific allocation.
*/
- if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
goto nopage;
retry:
@@ -2824,7 +2871,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
+ * of __GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
@@ -3201,38 +3248,31 @@ static void show_migration_types(unsigned char type)
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
- * Suppresses nodes that are not allowed by current's cpuset if
- * SHOW_MEM_FILTER_NODES is passed.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
*/
void show_free_areas(unsigned int filter)
{
+ unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
- show_node(zone);
- printk("%s per-cpu:\n", zone->name);
-
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pageset;
- pageset = per_cpu_ptr(zone->pageset, cpu);
-
- printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp.high,
- pageset->pcp.batch, pageset->pcp.count);
- }
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu\n"
- " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
- " free_cma:%lu\n",
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3283,14 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_FREE_PAGES),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_PAGES),
+ free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
@@ -3257,6 +3298,11 @@ void show_free_areas(unsigned int filter)
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
show_node(zone);
printk("%s"
" free:%lukB"
@@ -3283,6 +3329,8 @@ void show_free_areas(unsigned int filter)
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
" free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
@@ -3314,6 +3362,8 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5717,7 +5767,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas controls asynch page reclaim, and so should
+ * deltas control asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -6164,7 +6214,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
mask <<= (BITS_PER_LONG - bitidx - 1);
flags <<= (BITS_PER_LONG - bitidx - 1);
- word = ACCESS_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
if (word == old_word)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 755a42c76eb4..0c4505be0ed6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -177,8 +177,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
- pfn += pageblock_nr_pages)
- unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page)
+ unset_migratetype_isolate(page, migratetype);
+ }
return -EBUSY;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index c161a14b6a8f..dad23a43e42c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -456,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
@@ -500,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
@@ -712,6 +712,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
}
struct page_referenced_arg {
+ int dirtied;
int mapcount;
int referenced;
unsigned long vm_flags;
@@ -726,6 +727,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int referenced = 0;
+ int dirty = 0;
struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
@@ -749,6 +751,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
+
+ /*
+ * Use pmd_freeable instead of raw pmd_dirty because in some
+ * of architecture, pmd_dirty is not defined unless
+ * CONFIG_TRANSPARENT_HUGEPAGE is enabled
+ */
+ if (!pmd_freeable(*pmd))
+ dirty++;
+
spin_unlock(ptl);
} else {
pte_t *pte;
@@ -778,6 +789,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
+
+ if (pte_dirty(*pte))
+ dirty++;
+
pte_unmap_unlock(pte, ptl);
}
@@ -786,6 +801,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pra->vm_flags |= vma->vm_flags;
}
+ if (dirty)
+ pra->dirtied++;
+
pra->mapcount--;
if (!pra->mapcount)
return SWAP_SUCCESS; /* To break the loop */
@@ -810,6 +828,7 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
* @is_locked: caller holds lock on the page
* @memcg: target memory cgroup
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ * @is_pte_dirty: ptes which have marked dirty bit - used for lazyfree page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
@@ -817,7 +836,8 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
int ret;
int we_locked = 0;
@@ -832,6 +852,9 @@ int page_referenced(struct page *page,
};
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
+
if (!page_mapped(page))
return 0;
@@ -859,6 +882,9 @@ int page_referenced(struct page *page,
if (we_locked)
unlock_page(page);
+ if (is_pte_dirty)
+ *is_pte_dirty = pra.dirtied;
+
return pra.referenced;
}
@@ -1187,6 +1213,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
spinlock_t *ptl;
int ret = SWAP_AGAIN;
enum ttu_flags flags = (enum ttu_flags)arg;
+ int dirty = 0;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1216,7 +1243,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pteval = ptep_clear_flush(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
+ dirty = pte_dirty(pteval);
+ if (dirty)
set_page_dirty(page);
/* Update high watermark before we lower rss */
@@ -1245,6 +1273,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_entry_t entry = { .val = page_private(page) };
pte_t swp_pte;
+ if (flags & TTU_FREE) {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!dirty && !PageDirty(page)) {
+ /* It's a freeable page by MADV_FREE */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ } else {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ }
+
if (PageSwapCache(page)) {
/*
* Store the swap location in the pte.
@@ -1285,6 +1326,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
} else
dec_mm_counter(mm, MM_FILEPAGES);
+discard:
page_remove_rmap(page);
page_cache_release(page);
diff --git a/mm/shmem.c b/mm/shmem.c
index 80b360c7bcd1..2e2b943c8e62 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -981,7 +981,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1173,7 +1173,7 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
diff --git a/mm/slab.c b/mm/slab.c
index c4b89eaf4c96..7eb38dd1cefa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
return NULL;
}
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return flags;
+}
+
#else /* CONFIG_NUMA */
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
@@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
return __cache_free_alien(cachep, objp, node, page_node);
}
+
+/*
+ * Construct gfp mask to allocate from a specific node but do not invoke reclaim
+ * or warn about failures.
+ */
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+}
#endif
/*
@@ -2825,7 +2839,7 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
force_grow:
- x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
@@ -3019,7 +3033,7 @@ retry:
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (obj)
break;
}
@@ -3047,7 +3061,7 @@ retry:
nid = page_to_nid(page);
if (cache_grow(cache, flags, nid, page)) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (!obj)
/*
* Another processor may allocate the
@@ -3118,7 +3132,7 @@ retry:
must_grow:
spin_unlock(&n->list_lock);
- x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
if (x)
goto retry;
diff --git a/mm/slub.c b/mm/slub.c
index 82c473780c91..90227ad712f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -1137,15 +1139,6 @@ static int __init setup_slub_debug(char *str)
*/
goto check_slabs;
- if (tolower(*str) == 'o') {
- /*
- * Avoid enabling debugging on caches if its minimum order
- * would increase as a result.
- */
- disable_higher_order_debug = 1;
- goto out;
- }
-
slub_debug = 0;
if (*str == '-')
/*
@@ -1176,6 +1169,13 @@ static int __init setup_slub_debug(char *str)
case 'a':
slub_debug |= SLAB_FAILSLAB;
break;
+ case 'o':
+ /*
+ * Avoid enabling debugging on caches if its minimum
+ * order would increase as a result.
+ */
+ disable_higher_order_debug = 1;
+ break;
default:
pr_err("slub_debug option '%c' unknown. skipped\n",
*str);
@@ -4279,7 +4279,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
int node;
struct page *page;
- page = ACCESS_ONCE(c->page);
+ page = READ_ONCE(c->page);
if (!page)
continue;
@@ -4294,7 +4294,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
total += x;
nodes[node] += x;
- page = ACCESS_ONCE(c->partial);
+ page = READ_ONCE(c->partial);
if (page) {
node = page_to_nid(page);
if (flags & SO_TOTAL)
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..6b5adc7eb226 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -42,6 +42,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
@@ -743,7 +744,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
* be write it out by flusher threads as this is much more effective
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
int lru, file;
@@ -789,6 +790,23 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
+
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
@@ -811,6 +829,10 @@ void lru_add_drain_cpu(int cpu)
local_irq_restore(flags);
}
+ pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+
pvec = &per_cpu(lru_deactivate_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
@@ -819,25 +841,37 @@ void lru_add_drain_cpu(int cpu)
}
/**
- * deactivate_page - forcefully deactivate a page
+ * deactivate_file_page - forcefully deactivate a file page
* @page: page to deactivate
*
* This function hints the VM that @page is a good reclaim candidate,
* for example if its invalidation fails due to the page being dirty
* or under writeback.
*/
-void deactivate_page(struct page *page)
+void deactivate_file_page(struct page *page)
{
/*
- * In a workload with many unevictable page such as mprotect, unevictable
- * page deactivation for accelerating reclaim is pointless.
+ * In a workload with many unevictable page such as mprotect,
+ * unevictable page deactivation for accelerating reclaim is pointless.
*/
if (PageUnevictable(page))
return;
if (likely(get_page_unless_zero(page))) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ put_cpu_var(lru_deactivate_file_pvecs);
+ }
+}
+
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ page_cache_get(page);
if (!pagevec_add(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
put_cpu_var(lru_deactivate_pvecs);
@@ -872,6 +906,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 405923f77334..a2611ce55413 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -357,7 +357,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -371,7 +371,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
unsigned int pages, max_pages, last_ra;
static atomic_t last_readahead_pages;
- max_pages = 1 << ACCESS_ONCE(page_cluster);
+ max_pages = 1 << READ_ONCE(page_cluster);
if (max_pages <= 1)
return 1;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63f55ccb9b26..a7e72103f23b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
else
continue;
}
- count = ACCESS_ONCE(si->swap_map[i]);
+ count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index ddec5a5966d7..66af9031fae8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset,
}
/*
- * This cancels just the dirty bit on the kernel page itself, it
- * does NOT actually remove dirty bits on any mmap's that may be
- * around. It also leaves the page tagged dirty, so any sync
- * activity will still find it on the dirty lists, and in particular,
- * clear_page_dirty_for_io() will still look at the dirty bits in
- * the VM.
- *
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
- */
-void cancel_dirty_page(struct page *page, unsigned int account_size)
-{
- if (TestClearPageDirty(page)) {
- struct address_space *mapping = page->mapping;
- if (mapping && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host),
- BDI_RECLAIMABLE);
- if (account_size)
- task_io_account_cancelled_write(account_size);
- }
- }
-}
-EXPORT_SYMBOL(cancel_dirty_page);
-
-/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
@@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page))
do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ * Hence dirty accounting check is placed after invalidation.
+ */
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping);
ClearPageMappedToDisk(page);
delete_from_page_cache(page);
@@ -513,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
* of interest and try to speed up its reclaim.
*/
if (!ret)
- deactivate_page(page);
+ deactivate_file_page(page);
count += ret;
}
pagevec_remove_exceptionals(&pvec);
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..769a7a2870af 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -327,7 +357,10 @@ EXPORT_SYMBOL(kvfree);
struct address_space *page_mapping(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
+ mapping = page->mapping;
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 49abccf29a29..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -29,6 +29,7 @@
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
+#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
@@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_clear_huge(pmd))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
@@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_clear_huge(pud))
+ continue;
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next);
@@ -760,7 +765,7 @@ struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
unsigned long free, dirty;
- DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
+ unsigned long dirty_min, dirty_max; /*< dirty range */
struct list_head free_list;
struct rcu_head rcu_head;
struct list_head purge;
@@ -791,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
return addr;
}
-static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
+{
+ unsigned long addr;
+
+ addr = va_start + (pages_off << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
+ return (void *)addr;
+}
+
+/**
+ * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
+ * block. Of course pages number can't exceed VMAP_BBMAP_BITS
+ * @order: how many 2^order pages should be occupied in newly allocated block
+ * @gfp_mask: flags for the page level allocator
+ *
+ * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
+ */
+static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
int node, err;
+ void *vaddr;
node = numa_node_id();
@@ -821,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
return ERR_PTR(err);
}
+ vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
- vb->free = VMAP_BBMAP_BITS;
+ /* At least something should be left free */
+ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
+ vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
- bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
@@ -837,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
- list_add_rcu(&vb->free_list, &vbq->free);
+ list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
put_cpu_var(vmap_block_queue);
- return vb;
+ return vaddr;
}
static void free_vmap_block(struct vmap_block *vb)
@@ -876,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
vb->free = 0; /* prevent further allocs after releasing lock */
vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
@@ -905,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
- unsigned long addr = 0;
+ void *vaddr = NULL;
unsigned int order;
BUG_ON(size & ~PAGE_MASK);
@@ -920,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
}
order = get_order(size);
-again:
rcu_read_lock();
vbq = &get_cpu_var(vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i;
+ unsigned long pages_off;
spin_lock(&vb->lock);
- if (vb->free < 1UL << order)
- goto next;
+ if (vb->free < (1UL << order)) {
+ spin_unlock(&vb->lock);
+ continue;
+ }
- i = VMAP_BBMAP_BITS - vb->free;
- addr = vb->va->va_start + (i << PAGE_SHIFT);
- BUG_ON(addr_to_vb_idx(addr) !=
- addr_to_vb_idx(vb->va->va_start));
+ pages_off = VMAP_BBMAP_BITS - vb->free;
+ vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
vb->free -= 1UL << order;
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
}
+
spin_unlock(&vb->lock);
break;
-next:
- spin_unlock(&vb->lock);
}
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
- if (!addr) {
- vb = new_vmap_block(gfp_mask);
- if (IS_ERR(vb))
- return vb;
- goto again;
- }
+ /* Allocate new block if nothing was found */
+ if (!vaddr)
+ vaddr = new_vmap_block(order, gfp_mask);
- return (void *)addr;
+ return vaddr;
}
static void vb_free(const void *addr, unsigned long size)
@@ -974,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
order = get_order(size);
offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
+ offset >>= PAGE_SHIFT;
vb_idx = addr_to_vb_idx((unsigned long)addr);
rcu_read_lock();
@@ -984,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
spin_lock(&vb->lock);
- BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
+
+ /* Expand dirty range */
+ vb->dirty_min = min(vb->dirty_min, offset);
+ vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
vb->dirty += 1UL << order;
if (vb->dirty == VMAP_BBMAP_BITS) {
@@ -1023,25 +1050,18 @@ void vm_unmap_aliases(void)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i, j;
-
spin_lock(&vb->lock);
- i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
- if (i < VMAP_BBMAP_BITS) {
+ if (vb->dirty) {
+ unsigned long va_start = vb->va->va_start;
unsigned long s, e;
- j = find_last_bit(vb->dirty_map,
- VMAP_BBMAP_BITS);
- j = j + 1; /* need exclusive index */
+ s = va_start + (vb->dirty_min << PAGE_SHIFT);
+ e = va_start + (vb->dirty_max << PAGE_SHIFT);
- s = vb->va->va_start + (i << PAGE_SHIFT);
- e = vb->va->va_start + (j << PAGE_SHIFT);
- flush = 1;
+ start = min(s, start);
+ end = max(e, end);
- if (s < start)
- start = s;
- if (e > end)
- end = e;
+ flush = 1;
}
spin_unlock(&vb->lock);
}
@@ -1314,7 +1334,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP)
- align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
+ align = 1ul << clamp_t(int, fls_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER);
size = PAGE_ALIGN(size);
if (unlikely(!size))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e8eadd71bac..dc6cd51577a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -754,13 +754,17 @@ enum page_references {
};
static enum page_references page_check_references(struct page *page,
- struct scan_control *sc)
+ struct scan_control *sc,
+ bool *freeable)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
+ int pte_dirty;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
- &vm_flags);
+ &vm_flags, &pte_dirty);
referenced_page = TestClearPageReferenced(page);
/*
@@ -801,6 +805,10 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_KEEP;
}
+ if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) &&
+ !PageDirty(page))
+ *freeable = true;
+
/* Reclaim if clean, defer dirty pages to writeback */
if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
@@ -869,6 +877,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ bool freeable = false;
cond_resched();
@@ -992,7 +1001,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (!force_reclaim)
- references = page_check_references(page, sc);
+ references = page_check_references(page, sc,
+ &freeable);
switch (references) {
case PAGEREF_ACTIVATE:
@@ -1009,22 +1019,31 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
- if (!(sc->gfp_mask & __GFP_IO))
- goto keep_locked;
- if (!add_to_swap(page, page_list))
- goto activate_locked;
- may_enter_fs = 1;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
+ if (!freeable) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (!add_to_swap(page, page_list))
+ goto activate_locked;
+ may_enter_fs = 1;
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ } else {
+ if (likely(!PageTransHuge(page)))
+ goto unmap;
+ /* try_to_unmap isn't aware of THP page */
+ if (unlikely(split_huge_page_to_list(page,
+ page_list)))
+ goto keep_locked;
+ }
}
-
+unmap:
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, ttu_flags)) {
+ if (page_mapped(page) && (mapping || freeable)) {
+ switch (try_to_unmap(page,
+ freeable ? TTU_FREE : ttu_flags)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -1032,7 +1051,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case SWAP_MLOCK:
goto cull_mlocked;
case SWAP_SUCCESS:
- ; /* try to free the page below */
+ /* try to free the page below */
+ if (!freeable)
+ break;
+ /*
+ * Freeable anon page doesn't have mapping
+ * due to skipping of swapcache so we free
+ * page in here rather than __remove_mapping.
+ */
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!page_freeze_refs(page, 1))
+ goto keep_locked;
+ __ClearPageLocked(page);
+ count_vm_event(PGLAZYFREED);
+ goto free_it;
}
}
@@ -1142,7 +1174,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
nr_reclaimed++;
@@ -1401,6 +1433,32 @@ int isolate_lru_page(struct page *page)
return ret;
}
+static int __too_many_isolated(struct zone *zone, int file,
+ struct scan_control *sc, int safe)
+{
+ unsigned long inactive, isolated;
+
+ if (safe) {
+ inactive = zone_page_state_snapshot(zone,
+ NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state_snapshot(zone,
+ NR_ISOLATED_ANON + file);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state(zone, NR_ISOLATED_ANON + file);
+ }
+
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ inactive >>= 3;
+
+ return isolated > inactive;
+}
+
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get resheduled. When there are massive number of tasks doing page
@@ -1409,33 +1467,24 @@ int isolate_lru_page(struct page *page)
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
- struct scan_control *sc)
+ struct scan_control *sc)
{
- unsigned long inactive, isolated;
-
if (current_is_kswapd())
return 0;
if (!global_reclaim(sc))
return 0;
- if (file) {
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE);
- } else {
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_ANON);
- }
-
/*
- * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
- * won't get blocked by normal direct-reclaimers, forming a circular
- * deadlock.
+ * __too_many_isolated(safe=0) is fast but inaccurate, because it
+ * doesn't account for the vm_stat_diff[] counters. So if it looks
+ * like too_many_isolated() is about to return true, fall back to the
+ * slower, more accurate zone_page_state_snapshot().
*/
- if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
- inactive >>= 3;
+ if (unlikely(__too_many_isolated(zone, file, sc, 0)))
+ return __too_many_isolated(zone, file, sc, 1);
- return isolated > inactive;
+ return 0;
}
static noinline_for_stack void
@@ -1772,7 +1821,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (page_referenced(page, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ &vm_flags, NULL)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f5cd974e11a..1fd0886a389f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -759,6 +759,7 @@ const char * const vmstat_text[] = {
"pgfault",
"pgmajfault",
+ "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..3b1512d005c9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
*/
/*
- * This allocator is designed for use with zram. Thus, the allocator is
- * supposed to work well under low memory conditions. In particular, it
- * never attempts higher order page allocation which is very likely to
- * fail under memory pressure. On the other hand, if we just use single
- * (0-order) pages, it would suffer from very high fragmentation --
- * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
- * This was one of the major issues with its predecessor (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
- * since this satisfies the requirements of all its current users (in the
- * worst case, page is incompressible and is thus stored "as-is" i.e. in
- * uncompressed form). For allocation requests larger than this size, failure
- * is returned (see zs_malloc).
- *
- * Additionally, zs_malloc() does not return a dereferenceable pointer.
- * Instead, it returns an opaque handle (unsigned long) which encodes actual
- * location of the allocated object. The reason for this indirection is that
- * zsmalloc does not keep zspages permanently mapped since that would cause
- * issues on 32-bit systems where the VA region for kernel space mappings
- * is very small. So, before using the allocating memory, the object has to
- * be mapped using zs_map_object() to get a usable pointer and subsequently
- * unmapped using zs_unmap_object().
- *
* Following is how we use various fields and flags of underlying
* struct page(s) to form a zspage.
*
@@ -57,6 +28,8 @@
*
* page->private (union with page->first_page): refers to the
* component page after the first page
+ * If the page is first_page for huge object, it stores handle.
+ * Look at size_class->huge.
* page->freelist: points to the first free object in zspage.
* Free objects are linked together using in-place
* metadata.
@@ -78,6 +51,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
@@ -110,6 +84,8 @@
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -133,13 +109,33 @@
#endif
#endif
#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT 0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
/*
@@ -172,6 +168,8 @@ enum fullness_group {
enum zs_stat_type {
OBJ_ALLOCATED,
OBJ_USED,
+ CLASS_ALMOST_FULL,
+ CLASS_ALMOST_EMPTY,
NR_ZS_STAT_TYPE,
};
@@ -216,6 +214,8 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
#ifdef CONFIG_ZSMALLOC_STAT
struct zs_size_stat stats;
@@ -233,14 +233,24 @@ struct size_class {
* This must be power of 2 and less than or equal to ZS_ALIGN
*/
struct link_free {
- /* Handle of next free chunk (encodes <PFN, obj_idx>) */
- void *next;
+ union {
+ /*
+ * Position of next free chunk (encodes <PFN, obj_idx>)
+ * It's valid for non-allocated object
+ */
+ void *next;
+ /*
+ * Handle of allocated object.
+ */
+ unsigned long handle;
+ };
};
struct zs_pool {
char *name;
struct size_class **size_class;
+ struct kmem_cache *handle_cachep;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
@@ -267,8 +277,37 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
+ bool huge;
};
+static int create_handle_cache(struct zs_pool *pool)
+{
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ return pool->handle_cachep ? 0 : 1;
+}
+
+static void destroy_handle_cache(struct zs_pool *pool)
+{
+ kmem_cache_destroy(pool->handle_cachep);
+}
+
+static unsigned long alloc_handle(struct zs_pool *pool)
+{
+ return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+ pool->flags & ~__GFP_HIGHMEM);
+}
+
+static void free_handle(struct zs_pool *pool, unsigned long handle)
+{
+ kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+ *(unsigned long *)handle = obj;
+}
+
/* zpool driver */
#ifdef CONFIG_ZPOOL
@@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = {
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+ return pages_per_zspage * PAGE_SIZE / size;
+}
+
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -396,9 +440,182 @@ static int get_size_class_index(int size)
idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
ZS_SIZE_CLASS_DELTA);
- return idx;
+ return min(zs_size_classes - 1, idx);
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long class_almost_full, class_almost_empty;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ "class", "size", "almost_full", "almost_empty",
+ "obj_allocated", "obj_used", "pages_used",
+ "pages_per_zspage");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+ class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ i, class->size, class_almost_full, class_almost_empty,
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage);
+
+ total_class_almost_full += class_almost_full;
+ total_class_almost_empty += class_almost_empty;
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ "Total", "", total_class_almost_full,
+ total_class_almost_empty, total_objs,
+ total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
+
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page)
fg = ZS_EMPTY;
else if (inuse == max_objects)
fg = ZS_FULL;
- else if (inuse <= max_objects / fullness_threshold_frac)
+ else if (inuse <= 3 * max_objects / fullness_threshold_frac)
fg = ZS_ALMOST_EMPTY;
else
fg = ZS_ALMOST_FULL;
@@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
list_add_tail(&page->lru, &(*head)->lru);
*head = page;
+ zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
struct page, lru);
list_del_init(&page->lru);
+ zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
* page from the freelist of the old fullness group to that of the new
* fullness group.
*/
-static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+static enum fullness_group fix_fullness_group(struct size_class *class,
struct page *page)
{
int class_idx;
- struct size_class *class;
enum fullness_group currfg, newfg;
BUG_ON(!is_first_page(page));
@@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
if (newfg == currfg)
goto out;
- class = pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@@ -512,7 +731,8 @@ out:
* to form a zspage for each size class. This is important
* to reduce wastage due to unusable space left at end of
* each zspage which is given as:
- * wastage = Zp - Zp % size_class
+ * wastage = Zp % class_size
+ * usage = Zp - wastage
* where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
*
* For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page)
/*
* Encode <page, obj_idx> as a single handle value.
- * On hardware platforms with physical memory starting at 0x0 the pfn
- * could be 0 so we ensure that the handle will never be 0 by adjusting the
- * encoded obj_idx value before encoding.
+ * We use the least bit of handle for tagging.
*/
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
- unsigned long handle;
+ unsigned long obj;
if (!page) {
BUG_ON(obj_idx);
return NULL;
}
- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
- handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+ obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj |= ((obj_idx) & OBJ_INDEX_MASK);
+ obj <<= OBJ_TAG_BITS;
- return (void *)handle;
+ return (void *)obj;
}
/*
* Decode <page, obj_idx> pair from the given object handle. We adjust the
* decoded obj_idx back to its original value since it was adjusted in
- * obj_location_to_handle().
+ * location_to_obj().
*/
-static void obj_handle_to_location(unsigned long handle, struct page **page,
+static void obj_to_location(unsigned long obj, struct page **page,
unsigned long *obj_idx)
{
- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
- *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
+}
+
+static unsigned long handle_to_obj(unsigned long handle)
+{
+ return *(unsigned long *)handle;
+}
+
+static unsigned long obj_to_head(struct size_class *class, struct page *page,
+ void *obj)
+{
+ if (class->huge) {
+ VM_BUG_ON(!is_first_page(page));
+ return *(unsigned long *)page_private(page);
+ } else
+ return *(unsigned long *)obj;
}
static unsigned long obj_idx_to_offset(struct page *page,
@@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
return off + obj_idx * class_size;
}
+static inline int trypin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+}
+
+static void pin_tag(unsigned long handle)
+{
+ while (!trypin_tag(handle));
+}
+
+static void unpin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+}
+
static void reset_page(struct page *page)
{
clear_bit(PG_private, &page->flags);
@@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
- link->next = obj_location_to_handle(page, i++);
+ link->next = location_to_obj(page, i++);
link += class->size / sizeof(*link);
}
@@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = obj_location_to_handle(next_page, 0);
+ link->next = location_to_obj(next_page, 0);
kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
@@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
init_zspage(first_page, class);
- first_page->freelist = obj_location_to_handle(first_page, 0);
+ first_page->freelist = location_to_obj(first_page, 0);
/* Maximum number of objects we can store in this zspage */
first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
@@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area,
{
int sizes[2];
void *addr;
- char *buf = area->vm_buf;
+ char *buf;
/* no write fastpath */
if (area->vm_mm == ZS_MM_RO)
goto out;
+ buf = area->vm_buf;
+ if (!area->huge) {
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
+ }
+
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -952,11 +1213,6 @@ static void init_zs_size_classes(void)
zs_size_classes = nr;
}
-static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
-{
- return pages_per_zspage * PAGE_SIZE / size;
-}
-
static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
if (prev->pages_per_zspage != pages_per_zspage)
@@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
- class->stats.objs[type] += cnt;
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
- class->stats.objs[type] -= cnt;
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return class->stats.objs[type];
-}
-
-static int __init zs_stat_init(void)
-{
- if (!debugfs_initialized())
- return -ENODEV;
-
- zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
- if (!zs_stat_root)
- return -ENOMEM;
-
- return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
- debugfs_remove_recursive(zs_stat_root);
-}
-
-static int zs_stats_size_show(struct seq_file *s, void *v)
+static bool zspage_full(struct page *page)
{
- int i;
- struct zs_pool *pool = s->private;
- struct size_class *class;
- int objs_per_zspage;
- unsigned long obj_allocated, obj_used, pages_used;
- unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
-
- seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
- "obj_allocated", "obj_used", "pages_used");
-
- for (i = 0; i < zs_size_classes; i++) {
- class = pool->size_class[i];
-
- if (class->index != i)
- continue;
-
- spin_lock(&class->lock);
- obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- obj_used = zs_stat_get(class, OBJ_USED);
- spin_unlock(&class->lock);
-
- objs_per_zspage = get_maxobj_per_zspage(class->size,
- class->pages_per_zspage);
- pages_used = obj_allocated / objs_per_zspage *
- class->pages_per_zspage;
-
- seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
- class->size, obj_allocated, obj_used, pages_used);
-
- total_objs += obj_allocated;
- total_used_objs += obj_used;
- total_pages += pages_used;
- }
-
- seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
- total_objs, total_used_objs, total_pages);
-
- return 0;
-}
-
-static int zs_stats_size_open(struct inode *inode, struct file *file)
-{
- return single_open(file, zs_stats_size_show, inode->i_private);
-}
-
-static const struct file_operations zs_stat_size_ops = {
- .open = zs_stats_size_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
- struct dentry *entry;
-
- if (!zs_stat_root)
- return -ENODEV;
-
- entry = debugfs_create_dir(name, zs_stat_root);
- if (!entry) {
- pr_warn("debugfs dir <%s> creation failed\n", name);
- return -ENOMEM;
- }
- pool->stat_dentry = entry;
-
- entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
- pool->stat_dentry, pool, &zs_stat_size_ops);
- if (!entry) {
- pr_warn("%s: debugfs file entry <%s> creation failed\n",
- name, "obj_in_classes");
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static void zs_pool_stat_destroy(struct zs_pool *pool)
-{
- debugfs_remove_recursive(pool->stat_dentry);
-}
-
-#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return 0;
-}
-
-static int __init zs_stat_init(void)
-{
- return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
-}
-
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
- return 0;
-}
+ BUG_ON(!is_first_page(page));
-static inline void zs_pool_stat_destroy(struct zs_pool *pool)
-{
+ return page->inuse == page->objects;
}
-#endif
-
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long obj, obj_idx, off;
unsigned int class_idx;
enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
struct page *pages[2];
+ void *ret;
BUG_ON(!handle);
@@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
*/
BUG_ON(in_interrupt());
- obj_handle_to_location(handle, &page, &obj_idx);
+ /* From now on, migration cannot move the object */
+ pin_tag(handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
area->vm_addr = kmap_atomic(page);
- return area->vm_addr + off;
+ ret = area->vm_addr + off;
+ goto out;
}
/* this object spans two pages */
@@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
pages[1] = get_next_page(page);
BUG_ON(!pages[1]);
- return __zs_map_object(area, pages, off, class->size);
+ ret = __zs_map_object(area, pages, off, class->size);
+out:
+ if (!class->huge)
+ ret += ZS_HANDLE_SIZE;
+
+ return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long obj, obj_idx, off;
unsigned int class_idx;
enum fullness_group fg;
@@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
BUG_ON(!handle);
- obj_handle_to_location(handle, &page, &obj_idx);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
put_cpu_var(zs_map_area);
+ unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
+static unsigned long obj_malloc(struct page *first_page,
+ struct size_class *class, unsigned long handle)
+{
+ unsigned long obj;
+ struct link_free *link;
+
+ struct page *m_page;
+ unsigned long m_objidx, m_offset;
+ void *vaddr;
+
+ handle |= OBJ_ALLOCATED_TAG;
+ obj = (unsigned long)first_page->freelist;
+ obj_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ if (!class->huge)
+ /* record handle in the header of allocated chunk */
+ link->handle = handle;
+ else
+ /* record handle in first_page->private */
+ set_page_private(first_page, handle);
+ kunmap_atomic(vaddr);
+ first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
+
+ return obj;
+}
+
+
/**
* zs_malloc - Allocate block of given size from pool.
* @pool: pool to allocate from
@@ -1236,18 +1384,25 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
- unsigned long obj;
- struct link_free *link;
+ unsigned long handle, obj;
struct size_class *class;
- void *vaddr;
-
- struct page *first_page, *m_page;
- unsigned long m_objidx, m_offset;
+ struct page *first_page;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
+ handle = alloc_handle(pool);
+ if (!handle)
+ return 0;
+
+ /* extra space in chunk to keep the handle */
+ size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
+ /* In huge class size, we store the handle into first_page->private */
+ if (class->huge) {
+ size -= ZS_HANDLE_SIZE;
+ class = pool->size_class[get_size_class_index(size)];
+ }
spin_lock(&class->lock);
first_page = find_get_zspage(class);
@@ -1255,8 +1410,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (!first_page) {
spin_unlock(&class->lock);
first_page = alloc_zspage(class, pool->flags);
- if (unlikely(!first_page))
+ if (unlikely(!first_page)) {
+ free_handle(pool, handle);
return 0;
+ }
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
@@ -1267,73 +1424,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
class->size, class->pages_per_zspage));
}
- obj = (unsigned long)first_page->freelist;
- obj_handle_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
-
- vaddr = kmap_atomic(m_page);
- link = (struct link_free *)vaddr + m_offset / sizeof(*link);
- first_page->freelist = link->next;
- memset(link, POISON_INUSE, sizeof(*link));
- kunmap_atomic(vaddr);
-
- first_page->inuse++;
- zs_stat_inc(class, OBJ_USED, 1);
+ obj = obj_malloc(first_page, class, handle);
/* Now move the zspage to another fullness group, if required */
- fix_fullness_group(pool, first_page);
+ fix_fullness_group(class, first_page);
+ record_obj(handle, obj);
spin_unlock(&class->lock);
- return obj;
+ return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-void zs_free(struct zs_pool *pool, unsigned long obj)
+static void obj_free(struct zs_pool *pool, struct size_class *class,
+ unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
-
int class_idx;
- struct size_class *class;
enum fullness_group fullness;
- if (unlikely(!obj))
- return;
+ BUG_ON(!obj);
- obj_handle_to_location(obj, &f_page, &f_objidx);
+ obj &= ~OBJ_ALLOCATED_TAG;
+ obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
get_zspage_mapping(first_page, &class_idx, &fullness);
- class = pool->size_class[class_idx];
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
- spin_lock(&class->lock);
+ vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
- vaddr = kmap_atomic(f_page);
link = (struct link_free *)(vaddr + f_offset);
link->next = first_page->freelist;
+ if (class->huge)
+ set_page_private(first_page, 0);
kunmap_atomic(vaddr);
first_page->freelist = (void *)obj;
-
first_page->inuse--;
- fullness = fix_fullness_group(pool, first_page);
-
zs_stat_dec(class, OBJ_USED, 1);
- if (fullness == ZS_EMPTY)
+}
+
+void zs_free(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *first_page, *f_page;
+ unsigned long obj, f_objidx;
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!handle))
+ return;
+
+ pin_tag(handle);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ obj_free(pool, class, obj);
+ fullness = fix_fullness_group(class, first_page);
+ if (fullness == ZS_EMPTY) {
zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
class->size, class->pages_per_zspage));
-
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+ free_zspage(first_page);
+ }
spin_unlock(&class->lock);
+ unpin_tag(handle);
+
+ free_handle(pool, handle);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+static void zs_object_copy(unsigned long src, unsigned long dst,
+ struct size_class *class)
+{
+ struct page *s_page, *d_page;
+ unsigned long s_objidx, d_objidx;
+ unsigned long s_off, d_off;
+ void *s_addr, *d_addr;
+ int s_size, d_size, size;
+ int written = 0;
+
+ s_size = d_size = class->size;
+
+ obj_to_location(src, &s_page, &s_objidx);
+ obj_to_location(dst, &d_page, &d_objidx);
+
+ s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
+ d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+
+ if (s_off + class->size > PAGE_SIZE)
+ s_size = PAGE_SIZE - s_off;
+
+ if (d_off + class->size > PAGE_SIZE)
+ d_size = PAGE_SIZE - d_off;
+
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+
+ while (1) {
+ size = min(s_size, d_size);
+ memcpy(d_addr + d_off, s_addr + s_off, size);
+ written += size;
+
+ if (written == class->size)
+ break;
+
+ s_off += size;
+ s_size -= size;
+ d_off += size;
+ d_size -= size;
+
+ if (s_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+ s_page = get_next_page(s_page);
+ BUG_ON(!s_page);
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+ s_size = class->size - written;
+ s_off = 0;
+ }
+
+ if (d_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ d_page = get_next_page(d_page);
+ BUG_ON(!d_page);
+ d_addr = kmap_atomic(d_page);
+ d_size = class->size - written;
+ d_off = 0;
+ }
+ }
+
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct page *page, int index,
+ struct size_class *class)
+{
+ unsigned long head;
+ int offset = 0;
+ unsigned long handle = 0;
+ void *addr = kmap_atomic(page);
+
+ if (!is_first_page(page))
+ offset = page->index;
+ offset += class->size * index;
+
+ while (offset < PAGE_SIZE) {
+ head = obj_to_head(class, page, addr + offset);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (trypin_tag(handle))
+ break;
+ handle = 0;
+ }
+
+ offset += class->size;
+ index++;
+ }
+
+ kunmap_atomic(addr);
+ return handle;
+}
+
+struct zs_compact_control {
+ /* Source page for migration which could be a subpage of zspage. */
+ struct page *s_page;
+ /* Destination page for migration which should be a first page
+ * of zspage. */
+ struct page *d_page;
+ /* Starting object index within @s_page which used for live object
+ * in the subpage. */
+ int index;
+ /* how many of objects are migrated */
+ int nr_migrated;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
+{
+ unsigned long used_obj, free_obj;
+ unsigned long handle;
+ struct page *s_page = cc->s_page;
+ struct page *d_page = cc->d_page;
+ unsigned long index = cc->index;
+ int nr_migrated = 0;
+ int ret = 0;
+
+ while (1) {
+ handle = find_alloced_obj(s_page, index, class);
+ if (!handle) {
+ s_page = get_next_page(s_page);
+ if (!s_page)
+ break;
+ index = 0;
+ continue;
+ }
+
+ /* Stop if there is no more space */
+ if (zspage_full(d_page)) {
+ unpin_tag(handle);
+ ret = -ENOMEM;
+ break;
+ }
+
+ used_obj = handle_to_obj(handle);
+ free_obj = obj_malloc(d_page, class, handle);
+ zs_object_copy(used_obj, free_obj, class);
+ index++;
+ record_obj(handle, free_obj);
+ unpin_tag(handle);
+ obj_free(pool, class, used_obj);
+ nr_migrated++;
+ }
+
+ /* Remember last position in this iteration */
+ cc->s_page = s_page;
+ cc->index = index;
+ cc->nr_migrated = nr_migrated;
+
+ return ret;
+}
+static struct page *alloc_target_page(struct size_class *class)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+ page = class->fullness_list[i];
+ if (page) {
+ remove_zspage(page, class, i);
+ break;
+ }
+ }
+
+ return page;
+}
+
+static void putback_zspage(struct zs_pool *pool, struct size_class *class,
+ struct page *first_page)
+{
+ int class_idx;
+ enum fullness_group fullness;
+
+ BUG_ON(!is_first_page(first_page));
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ insert_zspage(first_page, class, fullness);
+ fullness = fix_fullness_group(class, first_page);
if (fullness == ZS_EMPTY) {
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated);
+
free_zspage(first_page);
}
}
-EXPORT_SYMBOL_GPL(zs_free);
+
+static struct page *isolate_source_page(struct size_class *class)
+{
+ struct page *page;
+
+ page = class->fullness_list[ZS_ALMOST_EMPTY];
+ if (page)
+ remove_zspage(page, class, ZS_ALMOST_EMPTY);
+
+ return page;
+}
+
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
+{
+ int nr_to_migrate;
+ struct zs_compact_control cc;
+ struct page *src_page;
+ struct page *dst_page = NULL;
+ unsigned long nr_total_migrated = 0;
+
+ spin_lock(&class->lock);
+ while ((src_page = isolate_source_page(class))) {
+
+ BUG_ON(!is_first_page(src_page));
+
+ /* The goal is to migrate all live objects in source page */
+ nr_to_migrate = src_page->inuse;
+ cc.index = 0;
+ cc.s_page = src_page;
+
+ while ((dst_page = alloc_target_page(class))) {
+ cc.d_page = dst_page;
+ /*
+ * If there is no more space in dst_page, try to
+ * allocate another zspage.
+ */
+ if (!migrate_zspage(pool, class, &cc))
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ nr_total_migrated += cc.nr_migrated;
+ nr_to_migrate -= cc.nr_migrated;
+ }
+
+ /* Stop if we couldn't find slot */
+ if (dst_page == NULL)
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ putback_zspage(pool, class, src_page);
+ spin_unlock(&class->lock);
+ nr_total_migrated += cc.nr_migrated;
+ cond_resched();
+ spin_lock(&class->lock);
+ }
+
+ if (src_page)
+ putback_zspage(pool, class, src_page);
+
+ spin_unlock(&class->lock);
+
+ return nr_total_migrated;
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+ int i;
+ unsigned long nr_migrated = 0;
+ struct size_class *class;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+ nr_migrated += __zs_compact(pool, class);
+ }
+
+ return nr_migrated;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
/**
* zs_create_pool - Creates an allocation pool to work from.
@@ -1355,20 +1799,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
if (!pool)
return NULL;
- pool->name = kstrdup(name, GFP_KERNEL);
- if (!pool->name) {
- kfree(pool);
- return NULL;
- }
-
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
- kfree(pool->name);
kfree(pool);
return NULL;
}
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name)
+ goto err;
+
+ if (create_handle_cache(pool))
+ goto err;
+
/*
* Iterate reversly, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
@@ -1406,6 +1850,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
class->size = size;
class->index = i;
class->pages_per_zspage = pages_per_zspage;
+ if (pages_per_zspage == 1 &&
+ get_maxobj_per_zspage(size, pages_per_zspage) == 1)
+ class->huge = true;
spin_lock_init(&class->lock);
pool->size_class[i] = class;
@@ -1450,6 +1897,7 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
+ destroy_handle_cache(pool);
kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
diff --git a/mm/zswap.c b/mm/zswap.c
index 4249e82ff934..f8583f1fc938 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -490,7 +490,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -501,7 +501,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 50ec42f170a0..2dacc7b5af23 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -100,7 +100,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
new_stats =
kmem_cache_alloc_node(flow_stats_cache,
- GFP_THISNODE |
+ GFP_NOWAIT |
+ __GFP_THISNODE |
+ __GFP_NOWARN |
__GFP_NOMEMALLOC,
node);
if (likely(new_stats)) {
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index fb78117b896c..9068e72aa73c 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -1,9 +1,11 @@
config SUNRPC
tristate
+ depends on MULTIUSER
config SUNRPC_GSS
tristate
select OID_REGISTRY
+ depends on MULTIUSER
config SUNRPC_BACKCHANNEL
bool
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 5199bb1a017e..2928afffbb81 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1072,10 +1072,12 @@ void qword_add(char **bpp, int *lp, char *str)
if (len < 0) return;
- ret = string_escape_str(str, &bp, len, ESCAPE_OCTAL, "\\ \n\t");
- if (ret < 0 || ret == len)
+ ret = string_escape_str(str, bp, len, ESCAPE_OCTAL, "\\ \n\t");
+ if (ret >= len) {
+ bp += len;
len = -1;
- else {
+ } else {
+ bp += ret;
len -= ret;
*bp++ = ' ';
len--;
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index d12435992dea..d54a814a4bc8 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -47,6 +47,8 @@ my $ignore_perl_version = 0;
my $minimum_perl_version = 5.10.0;
my $min_conf_desc_length = 4;
my $spelling_file = "$D/spelling.txt";
+my $codespell = 0;
+my $codespellfile = "/usr/local/share/codespell/dictionary.txt";
sub help {
my ($exitcode) = @_;
@@ -88,6 +90,9 @@ Options:
file. It's your fault if there's no backup or git
--ignore-perl-version override checking of perl version. expect
runtime errors.
+ --codespell Use the codespell dictionary for spelling/typos
+ (default:/usr/local/share/codespell/dictionary.txt)
+ --codespellfile Use this codespell dictionary
-h, --help, --version display this help and exit
When FILE is - read standard input.
@@ -146,6 +151,8 @@ GetOptions(
'ignore-perl-version!' => \$ignore_perl_version,
'debug=s' => \%debug,
'test-only=s' => \$tst_only,
+ 'codespell!' => \$codespell,
+ 'codespellfile=s' => \$codespellfile,
'h|help' => \$help,
'version' => \$help
) or help(1);
@@ -316,6 +323,7 @@ our $Operators = qr{
our $c90_Keywords = qr{do|for|while|if|else|return|goto|continue|switch|default|case|break}x;
+our $BasicType;
our $NonptrType;
our $NonptrTypeMisordered;
our $NonptrTypeWithAttr;
@@ -436,6 +444,14 @@ foreach my $entry (@mode_permission_funcs) {
$mode_perms_search .= $entry->[0];
}
+our $mode_perms_world_writable = qr{
+ S_IWUGO |
+ S_IWOTH |
+ S_IRWXUGO |
+ S_IALLUGO |
+ 0[0-7][0-7][2367]
+}x;
+
our $allowed_asm_includes = qr{(?x:
irq|
memory|
@@ -449,7 +465,6 @@ my $misspellings;
my %spelling_fix;
if (open(my $spelling, '<', $spelling_file)) {
- my @spelling_list;
while (<$spelling>) {
my $line = $_;
@@ -461,21 +476,50 @@ if (open(my $spelling, '<', $spelling_file)) {
my ($suspect, $fix) = split(/\|\|/, $line);
- push(@spelling_list, $suspect);
$spelling_fix{$suspect} = $fix;
}
close($spelling);
- $misspellings = join("|", @spelling_list);
} else {
warn "No typos will be found - file '$spelling_file': $!\n";
}
+if ($codespell) {
+ if (open(my $spelling, '<', $codespellfile)) {
+ while (<$spelling>) {
+ my $line = $_;
+
+ $line =~ s/\s*\n?$//g;
+ $line =~ s/^\s*//g;
+
+ next if ($line =~ m/^\s*#/);
+ next if ($line =~ m/^\s*$/);
+ next if ($line =~ m/, disabled/i);
+
+ $line =~ s/,.*$//;
+
+ my ($suspect, $fix) = split(/->/, $line);
+
+ $spelling_fix{$suspect} = $fix;
+ }
+ close($spelling);
+ } else {
+ warn "No codespell typos will be found - file '$codespellfile': $!\n";
+ }
+}
+
+$misspellings = join("|", sort keys %spelling_fix) if keys %spelling_fix;
+
sub build_types {
my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)";
my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)";
my $Misordered = "(?x: \n" . join("|\n ", @typeListMisordered) . "\n)";
my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)";
$Modifier = qr{(?:$Attribute|$Sparse|$mods)};
+ $BasicType = qr{
+ (?:$typeOtherOSTypedefs\b)|
+ (?:$typeTypedefs\b)|
+ (?:${all}\b)
+ }x;
$NonptrType = qr{
(?:$Modifier\s+|const\s+)*
(?:
@@ -2303,8 +2347,9 @@ sub process {
}
# Check for various typo / spelling mistakes
- if (defined($misspellings) && ($in_commit_log || $line =~ /^\+/)) {
- while ($rawline =~ /(?:^|[^a-z@])($misspellings)(?:$|[^a-z@])/gi) {
+ if (defined($misspellings) &&
+ ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) {
+ while ($rawline =~ /(?:^|[^a-z@])($misspellings)(?:\b|$|[^a-z@])/gi) {
my $typo = $1;
my $typo_fix = $spelling_fix{lc($typo)};
$typo_fix = ucfirst($typo_fix) if ($typo =~ /^[A-Z]/);
@@ -2552,8 +2597,15 @@ sub process {
}
}
- if ($line =~ /^\+.*(\w+\s*)?\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic|[,;:\?\(\{\}\[\<\>]|&&|\|\||\\$)/ &&
- (!defined($1) || $1 !~ /sizeof\s*/)) {
+# check for space after cast like "(int) foo" or "(struct foo) bar"
+# avoid checking a few false positives:
+# "sizeof(<type>)" or "__alignof__(<type>)"
+# function pointer declarations like "(*foo)(int) = bar;"
+# structure definitions like "(struct foo) { 0 };"
+# multiline macros that define functions
+# known attributes or the __attribute__ keyword
+ if ($line =~ /^\+(.*)\(\s*$Type\s*\)([ \t]++)((?![={]|\\$|$Attribute|__attribute__))/ &&
+ (!defined($1) || $1 !~ /\b(?:sizeof|__alignof__)\s*$/)) {
if (CHK("SPACING",
"No space is necessary after a cast\n" . $herecurr) &&
$fix) {
@@ -3146,6 +3198,18 @@ sub process {
$herecurr);
}
+# check for const <foo> const where <foo> is not a pointer or array type
+ if ($sline =~ /\bconst\s+($BasicType)\s+const\b/) {
+ my $found = $1;
+ if ($sline =~ /\bconst\s+\Q$found\E\s+const\b\s*\*/) {
+ WARN("CONST_CONST",
+ "'const $found const *' should probably be 'const $found * const'\n" . $herecurr);
+ } elsif ($sline !~ /\bconst\s+\Q$found\E\s+const\s+\w+\s*\[/) {
+ WARN("CONST_CONST",
+ "'const $found const' should probably be 'const $found'\n" . $herecurr);
+ }
+ }
+
# check for non-global char *foo[] = {"bar", ...} declarations.
if ($line =~ /^.\s+(?:static\s+|const\s+)?char\s+\*\s*\w+\s*\[\s*\]\s*=\s*\{/) {
WARN("STATIC_CONST_CHAR_ARRAY",
@@ -3963,12 +4027,12 @@ sub process {
}
}
-# Return of what appears to be an errno should normally be -'ve
- if ($line =~ /^.\s*return\s*(E[A-Z]*)\s*;/) {
+# Return of what appears to be an errno should normally be negative
+ if ($sline =~ /\breturn(?:\s*\(+\s*|\s+)(E[A-Z]+)(?:\s*\)+\s*|\s*)[;:,]/) {
my $name = $1;
if ($name ne 'EOF' && $name ne 'ERROR') {
WARN("USE_NEGATIVE_ERRNO",
- "return of an errno should typically be -ve (return -$1)\n" . $herecurr);
+ "return of an errno should typically be negative (ie: return -$1)\n" . $herecurr);
}
}
@@ -5318,8 +5382,8 @@ sub process {
}
}
- if ($line =~ /debugfs_create_file.*S_IWUGO/ ||
- $line =~ /DEVICE_ATTR.*S_IWUGO/ ) {
+ if ($line =~ /debugfs_create_\w+.*\b$mode_perms_world_writable\b/ ||
+ $line =~ /DEVICE_ATTR.*\b$mode_perms_world_writable\b/) {
WARN("EXPORTED_WORLD_WRITABLE",
"Exporting world writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr);
}
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index fc7fd52b5e03..bb8e4d0a1911 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -825,6 +825,7 @@ retreived||retrieved
retreive||retrieve
retrive||retrieve
retuned||returned
+reudce||reduce
reuest||request
reuqest||request
reutnred||returned
diff --git a/security/Kconfig b/security/Kconfig
index beb86b500adf..bf4ec46474b6 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -21,6 +21,7 @@ config SECURITY_DMESG_RESTRICT
config SECURITY
bool "Enable different security models"
depends on SYSFS
+ depends on MULTIUSER
help
This allows you to choose different security modules to be
configured into your kernel.