summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/Kconfig.debug46
-rw-r--r--lib/Kconfig.ubsan3
-rw-r--r--lib/debugobjects.c10
-rw-r--r--lib/idr.c11
-rw-r--r--lib/iov_iter.c1
-rw-r--r--lib/kobject_uevent.c6
-rw-r--r--lib/list_debug.c99
-rw-r--r--lib/locking-selftest.c66
-rw-r--r--lib/lockref.c2
-rw-r--r--lib/nlattr.c2
-rw-r--r--lib/parser.c47
-rw-r--r--lib/percpu_counter.c25
-rw-r--r--lib/radix-tree.c1195
-rw-r--r--lib/raid6/avx2.c232
-rw-r--r--lib/rbtree.c23
-rw-r--r--lib/swiotlb.c81
-rw-r--r--lib/test_kasan.c29
17 files changed, 1260 insertions, 618 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a6c8db1d62f6..7446097f72bd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -13,7 +13,22 @@ config PRINTK_TIME
be included, not that the timestamp is recorded.
The behavior is also controlled by the kernel command line
- parameter printk.time=1. See Documentation/kernel-parameters.txt
+ parameter printk.time=1. See Documentation/admin-guide/kernel-parameters.rst
+
+config CONSOLE_LOGLEVEL_DEFAULT
+ int "Default console loglevel (1-15)"
+ range 1 15
+ default "7"
+ help
+ Default loglevel to determine what will be printed on the console.
+
+ Setting a default here is equivalent to passing in loglevel=<x> in
+ the kernel bootargs. loglevel=<x> continues to override whatever
+ value is specified here as well.
+
+ Note: This does not affect the log level of un-prefixed prink()
+ usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
+ option.
config MESSAGE_LOGLEVEL_DEFAULT
int "Default message log level (1-7)"
@@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT
that are auditing their logs closely may want to set it to a lower
priority.
+ Note: This does not affect what message level gets printed on the console
+ by default. To change that, use loglevel=<x> in the kernel bootargs,
+ or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value.
+
config BOOT_PRINTK_DELAY
bool "Delay each boot printk message by N milliseconds"
depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@@ -175,8 +194,8 @@ config GDB_SCRIPTS
build directory. If you load vmlinux into gdb, the helper
scripts will be automatically imported by gdb as well, and
additional functions are available to analyze a Linux kernel
- instance. See Documentation/gdb-kernel-debugging.txt for further
- details.
+ instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
+ for further details.
config ENABLE_WARN_DEPRECATED
bool "Enable __deprecated logic"
@@ -523,7 +542,7 @@ config DEBUG_KMEMLEAK
difference being that the orphan objects are not freed but
only shown in /sys/kernel/debug/kmemleak. Enabling this
feature will introduce an overhead to memory
- allocations. See Documentation/kmemleak.txt for more
+ allocations. See Documentation/dev-tools/kmemleak.rst for more
details.
Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances
@@ -720,7 +739,7 @@ config KCOV
different machines and across reboots. If you need stable PC values,
disable RANDOMIZE_BASE.
- For more details, see Documentation/kcov.txt.
+ For more details, see Documentation/dev-tools/kcov.rst.
config KCOV_INSTRUMENT_ALL
bool "Instrument all code by default"
@@ -1218,7 +1237,7 @@ config DEBUG_BUGVERBOSE
config DEBUG_LIST
bool "Debug linked list manipulation"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION
help
Enable this to turn on extended checks in the linked-list
walking routines.
@@ -1434,7 +1453,8 @@ config RCU_TRACE
select TRACE_CLOCK
help
This option provides tracing in RCU which presents stats
- in debugfs for debugging RCU implementation.
+ in debugfs for debugging RCU implementation. It also enables
+ additional tracepoints for ftrace-style event tracing.
Say Y here if you want to enable RCU tracing
Say N if you are unsure.
@@ -1964,6 +1984,16 @@ config TEST_STATIC_KEYS
If unsure, say N.
+config BUG_ON_DATA_CORRUPTION
+ bool "Trigger a BUG when data corruption is detected"
+ select DEBUG_LIST
+ help
+ Select this option if the kernel should BUG when it encounters
+ data corruption in kernel memory structures when they get checked
+ for validity.
+
+ If unsure, say N.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
@@ -1975,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
- depends on MMU
+ depends on MMU && DEVMEM
depends on ARCH_HAS_DEVMEM_IS_ALLOWED
default y if TILE || PPC
---help---
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index bc6e651df68c..a669c193b878 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -10,7 +10,8 @@ config UBSAN
This option enables undefined behaviour sanity checker
Compile-time instrumentation is used to detect various undefined
behaviours in runtime. Various types of checks may be enabled
- via boot parameter ubsan_handle (see: Documentation/ubsan.txt).
+ via boot parameter ubsan_handle
+ (see: Documentation/dev-tools/ubsan.rst).
config UBSAN_SANITIZE_ALL
bool "Enable instrumentation for the entire kernel"
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index a8e12601eb37..04c1ef717fe0 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -199,7 +199,7 @@ static void free_object(struct debug_obj *obj)
* initialized:
*/
if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache)
- sched = keventd_up();
+ sched = 1;
hlist_add_head(&obj->node, &obj_pool);
obj_pool_free++;
obj_pool_used--;
@@ -362,6 +362,7 @@ void debug_object_init(void *addr, struct debug_obj_descr *descr)
__debug_object_init(addr, descr, 0);
}
+EXPORT_SYMBOL_GPL(debug_object_init);
/**
* debug_object_init_on_stack - debug checks when an object on stack is
@@ -376,6 +377,7 @@ void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr)
__debug_object_init(addr, descr, 1);
}
+EXPORT_SYMBOL_GPL(debug_object_init_on_stack);
/**
* debug_object_activate - debug checks when an object is activated
@@ -449,6 +451,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
}
return 0;
}
+EXPORT_SYMBOL_GPL(debug_object_activate);
/**
* debug_object_deactivate - debug checks when an object is deactivated
@@ -496,6 +499,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_deactivate);
/**
* debug_object_destroy - debug checks when an object is destroyed
@@ -542,6 +546,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
out_unlock:
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_destroy);
/**
* debug_object_free - debug checks when an object is freed
@@ -582,6 +587,7 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)
out_unlock:
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_free);
/**
* debug_object_assert_init - debug checks when object should be init-ed
@@ -626,6 +632,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_assert_init);
/**
* debug_object_active_state - debug checks object usage state machine
@@ -673,6 +680,7 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr,
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_active_state);
#ifdef CONFIG_DEBUG_OBJECTS_FREE
static void __debug_check_no_obj_freed(const void *address, unsigned long size)
diff --git a/lib/idr.c b/lib/idr.c
index 6098336df267..52d2979a05e8 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get);
* and go back to the ida_pre_get() call. If the ida is full, it will
* return %-ENOSPC.
*
+ * Note that callers must ensure that concurrent access to @ida is not possible.
+ * See ida_simple_get() for a varaint which takes care of locking.
+ *
* @p_id returns a value in the range @starting_id ... %0x7fffffff.
*/
int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
@@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy);
* Allocates an id in the range start <= id < end, or returns -ENOSPC.
* On memory allocation failure, returns -ENOMEM.
*
+ * Compared to ida_get_new_above() this function does its own locking, and
+ * should be used unless there are special requirements.
+ *
* Use ida_simple_remove() to get rid of an id.
*/
int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
@@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get);
* ida_simple_remove - remove an allocated id.
* @ida: the (initialized) ida.
* @id: the id returned by ida_simple_get.
+ *
+ * Use to release an id allocated with ida_simple_get().
+ *
+ * Compared to ida_remove() this function does its own locking, and should be
+ * used unless there are special requirements.
*/
void ida_simple_remove(struct ida *ida, unsigned int id)
{
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 83c00b7f59b5..228892dabba6 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1,4 +1,5 @@
#include <linux/export.h>
+#include <linux/bvec.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index f6c2c1e7779c..9a2b811966eb 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -56,7 +56,7 @@ static const char *kobject_actions[] = {
* kobject_action_type - translate action string to numeric type
*
* @buf: buffer containing the action string, newline is ignored
- * @len: length of buffer
+ * @count: length of buffer
* @type: pointer to the location to store the action type
*
* Returns 0 if the action string was recognized.
@@ -154,8 +154,8 @@ static void cleanup_uevent_env(struct subprocess_info *info)
/**
* kobject_uevent_env - send an uevent with environmental data
*
- * @action: action that is happening
* @kobj: struct kobject that the action is happening to
+ * @action: action that is happening
* @envp_ext: pointer to environmental data
*
* Returns 0 if kobject_uevent_env() is completed with success or the
@@ -363,8 +363,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_env);
/**
* kobject_uevent - notify userspace by sending an uevent
*
- * @action: action that is happening
* @kobj: struct kobject that the action is happening to
+ * @action: action that is happening
*
* Returns 0 if kobject_uevent() is completed with success or the
* corresponding error when it fails.
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 3859bf63561c..7f7bfa55eb6d 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -2,8 +2,7 @@
* Copyright 2006, Red Hat, Inc., Dave Jones
* Released under the General Public License (GPL).
*
- * This file contains the linked list implementations for
- * DEBUG_LIST.
+ * This file contains the linked list validation for DEBUG_LIST.
*/
#include <linux/export.h>
@@ -13,88 +12,48 @@
#include <linux/rculist.h>
/*
- * Insert a new entry between two known consecutive entries.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
+ * Check that the data structures for the list manipulations are reasonably
+ * valid. Failures here indicate memory corruption (and possibly an exploit
+ * attempt).
*/
-void __list_add(struct list_head *new,
- struct list_head *prev,
- struct list_head *next)
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
{
- WARN(next->prev != prev,
- "list_add corruption. next->prev should be "
- "prev (%p), but was %p. (next=%p).\n",
+ CHECK_DATA_CORRUPTION(next->prev != prev,
+ "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
prev, next->prev, next);
- WARN(prev->next != next,
- "list_add corruption. prev->next should be "
- "next (%p), but was %p. (prev=%p).\n",
+ CHECK_DATA_CORRUPTION(prev->next != next,
+ "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
next, prev->next, prev);
- WARN(new == prev || new == next,
- "list_add double add: new=%p, prev=%p, next=%p.\n",
- new, prev, next);
- next->prev = new;
- new->next = next;
- new->prev = prev;
- WRITE_ONCE(prev->next, new);
+ CHECK_DATA_CORRUPTION(new == prev || new == next,
+ "list_add double add: new=%p, prev=%p, next=%p.\n",
+ new, prev, next);
+
+ return true;
}
-EXPORT_SYMBOL(__list_add);
+EXPORT_SYMBOL(__list_add_valid);
-void __list_del_entry(struct list_head *entry)
+bool __list_del_entry_valid(struct list_head *entry)
{
struct list_head *prev, *next;
prev = entry->prev;
next = entry->next;
- if (WARN(next == LIST_POISON1,
+ CHECK_DATA_CORRUPTION(next == LIST_POISON1,
"list_del corruption, %p->next is LIST_POISON1 (%p)\n",
- entry, LIST_POISON1) ||
- WARN(prev == LIST_POISON2,
+ entry, LIST_POISON1);
+ CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
"list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
- entry, LIST_POISON2) ||
- WARN(prev->next != entry,
- "list_del corruption. prev->next should be %p, "
- "but was %p\n", entry, prev->next) ||
- WARN(next->prev != entry,
- "list_del corruption. next->prev should be %p, "
- "but was %p\n", entry, next->prev))
- return;
-
- __list_del(prev, next);
-}
-EXPORT_SYMBOL(__list_del_entry);
+ entry, LIST_POISON2);
+ CHECK_DATA_CORRUPTION(prev->next != entry,
+ "list_del corruption. prev->next should be %p, but was %p\n",
+ entry, prev->next);
+ CHECK_DATA_CORRUPTION(next->prev != entry,
+ "list_del corruption. next->prev should be %p, but was %p\n",
+ entry, next->prev);
+ return true;
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty on entry does not return true after this, the entry is
- * in an undefined state.
- */
-void list_del(struct list_head *entry)
-{
- __list_del_entry(entry);
- entry->next = LIST_POISON1;
- entry->prev = LIST_POISON2;
-}
-EXPORT_SYMBOL(list_del);
-
-/*
- * RCU variants.
- */
-void __list_add_rcu(struct list_head *new,
- struct list_head *prev, struct list_head *next)
-{
- WARN(next->prev != prev,
- "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
- prev, next->prev, next);
- WARN(prev->next != next,
- "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
- next, prev->next, prev);
- new->next = next;
- new->prev = prev;
- rcu_assign_pointer(list_next_rcu(prev), new);
- next->prev = new;
}
-EXPORT_SYMBOL(__list_add_rcu);
+EXPORT_SYMBOL(__list_del_entry_valid);
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 872a15a2a637..f3a217ea0388 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -980,23 +980,23 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
#ifndef CONFIG_PROVE_LOCKING
if (expected == FAILURE && debug_locks) {
expected_testcase_failures++;
- printk("failed|");
+ pr_cont("failed|");
}
else
#endif
if (debug_locks != expected) {
unexpected_testcase_failures++;
- printk("FAILED|");
+ pr_cont("FAILED|");
dump_stack();
} else {
testcase_successes++;
- printk(" ok |");
+ pr_cont(" ok |");
}
testcase_total++;
if (debug_locks_verbose)
- printk(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
+ pr_cont(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
lockclass_mask, debug_locks, expected);
/*
* Some tests (e.g. double-unlock) might corrupt the preemption
@@ -1021,26 +1021,26 @@ static inline void print_testname(const char *testname)
#define DO_TESTCASE_1(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_1B(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_3(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \
dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_3RW(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\
dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_6(desc, name) \
print_testname(desc); \
@@ -1050,7 +1050,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_6_SUCCESS(desc, name) \
print_testname(desc); \
@@ -1060,7 +1060,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX); \
dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM); \
dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM); \
- printk("\n");
+ pr_cont("\n");
/*
* 'read' variant: rlocks must not trigger.
@@ -1073,7 +1073,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_2I(desc, name, nr) \
DO_TESTCASE_1("hard-"desc, name##_hard, nr); \
@@ -1726,25 +1726,25 @@ static void ww_tests(void)
dotest(ww_test_fail_acquire, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_normal, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_unneeded_slow, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("ww contexts mixing");
dotest(ww_test_two_contexts, FAILURE, LOCKTYPE_WW);
dotest(ww_test_diff_class, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("finishing ww context");
dotest(ww_test_context_done_twice, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_unlock_twice, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_fini_early, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_lock_after_done, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("locking mismatches");
dotest(ww_test_object_unlock_twice, FAILURE, LOCKTYPE_WW);
dotest(ww_test_object_lock_unbalanced, FAILURE, LOCKTYPE_WW);
dotest(ww_test_object_lock_stale_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("EDEADLK handling");
dotest(ww_test_edeadlk_normal, SUCCESS, LOCKTYPE_WW);
@@ -1757,11 +1757,11 @@ static void ww_tests(void)
dotest(ww_test_edeadlk_acquire_more_edeadlk_slow, FAILURE, LOCKTYPE_WW);
dotest(ww_test_edeadlk_acquire_wrong, FAILURE, LOCKTYPE_WW);
dotest(ww_test_edeadlk_acquire_wrong_slow, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("spinlock nest unlocked");
dotest(ww_test_spin_nest_unlocked, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
printk(" -----------------------------------------------------\n");
printk(" |block | try |context|\n");
@@ -1771,25 +1771,25 @@ static void ww_tests(void)
dotest(ww_test_context_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_context_context, SUCCESS, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("try");
dotest(ww_test_try_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_try_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("block");
dotest(ww_test_block_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_block_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_block_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("spinlock");
dotest(ww_test_spin_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_spin_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_spin_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
}
void locking_selftest(void)
@@ -1829,32 +1829,32 @@ void locking_selftest(void)
printk(" --------------------------------------------------------------------------\n");
print_testname("recursive read-lock");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
print_testname("recursive read-lock #2");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
print_testname("mixed read-write-lock");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA2, FAILURE, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA2, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
print_testname("mixed write-read-lock");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA3, FAILURE, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
printk(" --------------------------------------------------------------------------\n");
diff --git a/lib/lockref.c b/lib/lockref.c
index 5a92189ad711..c4bfcb8836cd 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -20,7 +20,7 @@
if (likely(old.lock_count == prev.lock_count)) { \
SUCCESS; \
} \
- cpu_relax_lowlatency(); \
+ cpu_relax(); \
} \
} while (0)
diff --git a/lib/nlattr.c b/lib/nlattr.c
index fce1e9afc6d9..b42b8577fc23 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -14,7 +14,7 @@
#include <linux/types.h>
#include <net/netlink.h>
-static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = {
+static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
[NLA_U8] = sizeof(u8),
[NLA_U16] = sizeof(u16),
[NLA_U32] = sizeof(u32),
diff --git a/lib/parser.c b/lib/parser.c
index b6d11631231b..3278958b472a 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -152,6 +152,36 @@ static int match_number(substring_t *s, int *result, int base)
}
/**
+ * match_u64int: scan a number in the given base from a substring_t
+ * @s: substring to be scanned
+ * @result: resulting u64 on success
+ * @base: base to use when converting string
+ *
+ * Description: Given a &substring_t and a base, attempts to parse the substring
+ * as a number in that base. On success, sets @result to the integer represented
+ * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+static int match_u64int(substring_t *s, u64 *result, int base)
+{
+ char *buf;
+ int ret;
+ u64 val;
+ size_t len = s->to - s->from;
+
+ buf = kmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memcpy(buf, s->from, len);
+ buf[len] = '\0';
+
+ ret = kstrtoull(buf, base, &val);
+ if (!ret)
+ *result = val;
+ kfree(buf);
+ return ret;
+}
+
+/**
* match_int: - scan a decimal representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
@@ -167,6 +197,23 @@ int match_int(substring_t *s, int *result)
EXPORT_SYMBOL(match_int);
/**
+ * match_u64: - scan a decimal representation of a u64 from
+ * a substring_t
+ * @s: substring_t to be scanned
+ * @result: resulting unsigned long long on success
+ *
+ * Description: Attempts to parse the &substring_t @s as a long decimal
+ * integer. On success, sets @result to the integer represented by the
+ * string and returns 0.
+ * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+int match_u64(substring_t *s, u64 *result)
+{
+ return match_u64int(s, result, 0);
+}
+EXPORT_SYMBOL(match_u64);
+
+/**
* match_octal: - scan an octal representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 72d36113ccaa..c8cebb137076 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -158,25 +158,21 @@ EXPORT_SYMBOL(percpu_counter_destroy);
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);
-static void compute_batch_value(void)
+static int compute_batch_value(unsigned int cpu)
{
int nr = num_online_cpus();
percpu_counter_batch = max(32, nr*2);
+ return 0;
}
-static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
- unsigned long action, void *hcpu)
+static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
- unsigned int cpu;
struct percpu_counter *fbc;
- compute_batch_value();
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
+ compute_batch_value(cpu);
- cpu = (unsigned long)hcpu;
spin_lock_irq(&percpu_counters_lock);
list_for_each_entry(fbc, &percpu_counters, list) {
s32 *pcount;
@@ -190,7 +186,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
}
spin_unlock_irq(&percpu_counters_lock);
#endif
- return NOTIFY_OK;
+ return 0;
}
/*
@@ -222,8 +218,15 @@ EXPORT_SYMBOL(__percpu_counter_compare);
static int __init percpu_counter_startup(void)
{
- compute_batch_value();
- hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
+ compute_batch_value, NULL);
+ WARN_ON(ret < 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
+ "lib/percpu_cnt:dead", NULL,
+ percpu_counter_cpu_dead);
+ WARN_ON(ret < 0);
return 0;
}
module_init(percpu_counter_startup);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8e6d552c40dd..6f382e07de77 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -22,6 +22,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -30,7 +31,6 @@
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>
-#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/string.h>
#include <linux/bitops.h>
@@ -69,6 +69,11 @@ struct radix_tree_preload {
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
+static inline struct radix_tree_node *entry_to_node(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
+}
+
static inline void *node_to_entry(void *ptr)
{
return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
@@ -191,13 +196,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
* Returns next bit offset, or size if nothing found.
*/
static __always_inline unsigned long
-radix_tree_find_next_bit(const unsigned long *addr,
- unsigned long size, unsigned long offset)
+radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
+ unsigned long offset)
{
- if (!__builtin_constant_p(size))
- return find_next_bit(addr, size, offset);
+ const unsigned long *addr = node->tags[tag];
- if (offset < size) {
+ if (offset < RADIX_TREE_MAP_SIZE) {
unsigned long tmp;
addr += offset / BITS_PER_LONG;
@@ -205,14 +209,32 @@ radix_tree_find_next_bit(const unsigned long *addr,
if (tmp)
return __ffs(tmp) + offset;
offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
- while (offset < size) {
+ while (offset < RADIX_TREE_MAP_SIZE) {
tmp = *++addr;
if (tmp)
return __ffs(tmp) + offset;
offset += BITS_PER_LONG;
}
}
- return size;
+ return RADIX_TREE_MAP_SIZE;
+}
+
+static unsigned int iter_offset(const struct radix_tree_iter *iter)
+{
+ return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK;
+}
+
+/*
+ * The maximum index which can be stored in a radix tree
+ */
+static inline unsigned long shift_maxindex(unsigned int shift)
+{
+ return (RADIX_TREE_MAP_SIZE << shift) - 1;
+}
+
+static inline unsigned long node_maxindex(struct radix_tree_node *node)
+{
+ return shift_maxindex(node->shift);
}
#ifndef __KERNEL__
@@ -220,10 +242,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
{
unsigned long i;
- pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
- node, node->offset,
+ pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n",
+ node, node->offset, index, index | node_maxindex(node),
+ node->parent,
node->tags[0][0], node->tags[1][0], node->tags[2][0],
- node->shift, node->count, node->parent);
+ node->shift, node->count, node->exceptional);
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
unsigned long first = index | (i << node->shift);
@@ -231,14 +254,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
void *entry = node->slots[i];
if (!entry)
continue;
- if (is_sibling_entry(node, entry)) {
- pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n",
- entry, i,
- *(void **)entry_to_node(entry),
- first, last);
+ if (entry == RADIX_TREE_RETRY) {
+ pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n",
+ i, first, last, node);
} else if (!radix_tree_is_internal_node(entry)) {
- pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
- entry, i, first, last);
+ pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n",
+ entry, i, first, last, node);
+ } else if (is_sibling_entry(node, entry)) {
+ pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n",
+ entry, i, first, last, node,
+ *(void **)entry_to_node(entry));
} else {
dump_node(entry_to_node(entry), first);
}
@@ -262,7 +287,10 @@ static void radix_tree_dump(struct radix_tree_root *root)
* that the caller has pinned this thread of control to the current CPU.
*/
static struct radix_tree_node *
-radix_tree_node_alloc(struct radix_tree_root *root)
+radix_tree_node_alloc(struct radix_tree_root *root,
+ struct radix_tree_node *parent,
+ unsigned int shift, unsigned int offset,
+ unsigned int count, unsigned int exceptional)
{
struct radix_tree_node *ret = NULL;
gfp_t gfp_mask = root_gfp_mask(root);
@@ -307,6 +335,13 @@ radix_tree_node_alloc(struct radix_tree_root *root)
ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
BUG_ON(radix_tree_is_internal_node(ret));
+ if (ret) {
+ ret->parent = parent;
+ ret->shift = shift;
+ ret->offset = offset;
+ ret->count = count;
+ ret->exceptional = exceptional;
+ }
return ret;
}
@@ -314,18 +349,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
{
struct radix_tree_node *node =
container_of(head, struct radix_tree_node, rcu_head);
- int i;
/*
- * must only free zeroed nodes into the slab. radix_tree_shrink
- * can leave us with a non-NULL entry in the first slot, so clear
- * that here to make sure.
+ * Must only free zeroed nodes into the slab. We can be left with
+ * non-NULL entries by radix_tree_free_nodes, so clear the entries
+ * and tags here.
*/
- for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
- tag_clear(node, i, 0);
-
- node->slots[0] = NULL;
- node->count = 0;
+ memset(node->slots, 0, sizeof(node->slots));
+ memset(node->tags, 0, sizeof(node->tags));
+ INIT_LIST_HEAD(&node->private_list);
kmem_cache_free(radix_tree_node_cachep, node);
}
@@ -345,7 +377,7 @@ radix_tree_node_free(struct radix_tree_node *node)
* To make use of this facility, the radix tree must be initialised without
* __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
*/
-static int __radix_tree_preload(gfp_t gfp_mask, int nr)
+static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
@@ -411,6 +443,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
}
EXPORT_SYMBOL(radix_tree_maybe_preload);
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+/*
+ * Preload with enough objects to ensure that we can split a single entry
+ * of order @old_order into many entries of size @new_order
+ */
+int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
+ gfp_t gfp_mask)
+{
+ unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT);
+ unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
+ (new_order / RADIX_TREE_MAP_SHIFT);
+ unsigned nr = 0;
+
+ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
+ BUG_ON(new_order >= old_order);
+
+ while (layers--)
+ nr = nr * RADIX_TREE_MAP_SIZE + 1;
+ return __radix_tree_preload(gfp_mask, top * nr);
+}
+#endif
+
/*
* The same as function above, but preload number of nodes required to insert
* (1 << order) continuous naturally-aligned elements.
@@ -456,19 +510,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
return __radix_tree_preload(gfp_mask, nr_nodes);
}
-/*
- * The maximum index which can be stored in a radix tree
- */
-static inline unsigned long shift_maxindex(unsigned int shift)
-{
- return (RADIX_TREE_MAP_SIZE << shift) - 1;
-}
-
-static inline unsigned long node_maxindex(struct radix_tree_node *node)
-{
- return shift_maxindex(node->shift);
-}
-
static unsigned radix_tree_load_root(struct radix_tree_root *root,
struct radix_tree_node **nodep, unsigned long *maxindex)
{
@@ -506,8 +547,8 @@ static int radix_tree_extend(struct radix_tree_root *root,
goto out;
do {
- struct radix_tree_node *node = radix_tree_node_alloc(root);
-
+ struct radix_tree_node *node = radix_tree_node_alloc(root,
+ NULL, shift, 0, 1, 0);
if (!node)
return -ENOMEM;
@@ -518,12 +559,12 @@ static int radix_tree_extend(struct radix_tree_root *root,
}
BUG_ON(shift > BITS_PER_LONG);
- node->shift = shift;
- node->offset = 0;
- node->count = 1;
- node->parent = NULL;
- if (radix_tree_is_internal_node(slot))
+ if (radix_tree_is_internal_node(slot)) {
entry_to_node(slot)->parent = node;
+ } else if (radix_tree_exceptional_entry(slot)) {
+ /* Moving an exceptional root->rnode to a node */
+ node->exceptional = 1;
+ }
node->slots[0] = slot;
slot = node_to_entry(node);
rcu_assign_pointer(root->rnode, slot);
@@ -534,6 +575,104 @@ out:
}
/**
+ * radix_tree_shrink - shrink radix tree to minimum height
+ * @root radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root,
+ radix_tree_update_node_t update_node,
+ void *private)
+{
+ for (;;) {
+ struct radix_tree_node *node = root->rnode;
+ struct radix_tree_node *child;
+
+ if (!radix_tree_is_internal_node(node))
+ break;
+ node = entry_to_node(node);
+
+ /*
+ * The candidate node has more than one child, or its child
+ * is not at the leftmost slot, or the child is a multiorder
+ * entry, we cannot shrink.
+ */
+ if (node->count != 1)
+ break;
+ child = node->slots[0];
+ if (!child)
+ break;
+ if (!radix_tree_is_internal_node(child) && node->shift)
+ break;
+
+ if (radix_tree_is_internal_node(child))
+ entry_to_node(child)->parent = NULL;
+
+ /*
+ * We don't need rcu_assign_pointer(), since we are simply
+ * moving the node from one part of the tree to another: if it
+ * was safe to dereference the old pointer to it
+ * (node->slots[0]), it will be safe to dereference the new
+ * one (root->rnode) as far as dependent read barriers go.
+ */
+ root->rnode = child;
+
+ /*
+ * We have a dilemma here. The node's slot[0] must not be
+ * NULLed in case there are concurrent lookups expecting to
+ * find the item. However if this was a bottom-level node,
+ * then it may be subject to the slot pointer being visible
+ * to callers dereferencing it. If item corresponding to
+ * slot[0] is subsequently deleted, these callers would expect
+ * their slot to become empty sooner or later.
+ *
+ * For example, lockless pagecache will look up a slot, deref
+ * the page pointer, and if the page has 0 refcount it means it
+ * was concurrently deleted from pagecache so try the deref
+ * again. Fortunately there is already a requirement for logic
+ * to retry the entire slot lookup -- the indirect pointer
+ * problem (replacing direct root node with an indirect pointer
+ * also results in a stale slot). So tag the slot as indirect
+ * to force callers to retry.
+ */
+ node->count = 0;
+ if (!radix_tree_is_internal_node(child)) {
+ node->slots[0] = RADIX_TREE_RETRY;
+ if (update_node)
+ update_node(node, private);
+ }
+
+ radix_tree_node_free(node);
+ }
+}
+
+static void delete_node(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ radix_tree_update_node_t update_node, void *private)
+{
+ do {
+ struct radix_tree_node *parent;
+
+ if (node->count) {
+ if (node == entry_to_node(root->rnode))
+ radix_tree_shrink(root, update_node, private);
+ return;
+ }
+
+ parent = node->parent;
+ if (parent) {
+ parent->slots[node->offset] = NULL;
+ parent->count--;
+ } else {
+ root_tag_clear_all(root);
+ root->rnode = NULL;
+ }
+
+ radix_tree_node_free(node);
+
+ node = parent;
+ } while (node);
+}
+
+/**
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
* @index: index key
@@ -563,26 +702,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
shift = radix_tree_load_root(root, &child, &maxindex);
/* Make sure the tree is high enough. */
+ if (order > 0 && max == ((1UL << order) - 1))
+ max++;
if (max > maxindex) {
int error = radix_tree_extend(root, max, shift);
if (error < 0)
return error;
shift = error;
child = root->rnode;
- if (order == shift)
- shift += RADIX_TREE_MAP_SHIFT;
}
while (shift > order) {
shift -= RADIX_TREE_MAP_SHIFT;
if (child == NULL) {
/* Have to add a child node. */
- child = radix_tree_node_alloc(root);
+ child = radix_tree_node_alloc(root, node, shift,
+ offset, 0, 0);
if (!child)
return -ENOMEM;
- child->shift = shift;
- child->offset = offset;
- child->parent = node;
rcu_assign_pointer(*slot, node_to_entry(child));
if (node)
node->count++;
@@ -595,31 +732,125 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
slot = &node->slots[offset];
}
+ if (nodep)
+ *nodep = node;
+ if (slotp)
+ *slotp = slot;
+ return 0;
+}
+
#ifdef CONFIG_RADIX_TREE_MULTIORDER
- /* Insert pointers to the canonical entry */
- if (order > shift) {
- unsigned i, n = 1 << (order - shift);
+/*
+ * Free any nodes below this node. The tree is presumed to not need
+ * shrinking, and any user data in the tree is presumed to not need a
+ * destructor called on it. If we need to add a destructor, we can
+ * add that functionality later. Note that we may not clear tags or
+ * slots from the tree as an RCU walker may still have a pointer into
+ * this subtree. We could replace the entries with RADIX_TREE_RETRY,
+ * but we'll still have to clear those in rcu_free.
+ */
+static void radix_tree_free_nodes(struct radix_tree_node *node)
+{
+ unsigned offset = 0;
+ struct radix_tree_node *child = entry_to_node(node);
+
+ for (;;) {
+ void *entry = child->slots[offset];
+ if (radix_tree_is_internal_node(entry) &&
+ !is_sibling_entry(child, entry)) {
+ child = entry_to_node(entry);
+ offset = 0;
+ continue;
+ }
+ offset++;
+ while (offset == RADIX_TREE_MAP_SIZE) {
+ struct radix_tree_node *old = child;
+ offset = child->offset + 1;
+ child = child->parent;
+ radix_tree_node_free(old);
+ if (old == entry_to_node(node))
+ return;
+ }
+ }
+}
+
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+ void *item, unsigned order, bool replace)
+{
+ struct radix_tree_node *child;
+ unsigned i, n, tag, offset, tags = 0;
+
+ if (node) {
+ if (order > node->shift)
+ n = 1 << (order - node->shift);
+ else
+ n = 1;
+ offset = get_slot_offset(node, slot);
+ } else {
+ n = 1;
+ offset = 0;
+ }
+
+ if (n > 1) {
offset = offset & ~(n - 1);
slot = &node->slots[offset];
- child = node_to_entry(slot);
- for (i = 0; i < n; i++) {
- if (slot[i])
+ }
+ child = node_to_entry(slot);
+
+ for (i = 0; i < n; i++) {
+ if (slot[i]) {
+ if (replace) {
+ node->count--;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tag_get(node, tag, offset + i))
+ tags |= 1 << tag;
+ } else
return -EEXIST;
}
+ }
- for (i = 1; i < n; i++) {
+ for (i = 0; i < n; i++) {
+ struct radix_tree_node *old = slot[i];
+ if (i) {
rcu_assign_pointer(slot[i], child);
- node->count++;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tags & (1 << tag))
+ tag_clear(node, tag, offset + i);
+ } else {
+ rcu_assign_pointer(slot[i], item);
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tags & (1 << tag))
+ tag_set(node, tag, offset);
}
+ if (radix_tree_is_internal_node(old) &&
+ !is_sibling_entry(node, old) &&
+ (old != RADIX_TREE_RETRY))
+ radix_tree_free_nodes(old);
+ if (radix_tree_exceptional_entry(old))
+ node->exceptional--;
}
-#endif
-
- if (nodep)
- *nodep = node;
- if (slotp)
- *slotp = slot;
- return 0;
+ if (node) {
+ node->count += n;
+ if (radix_tree_exceptional_entry(item))
+ node->exceptional += n;
+ }
+ return n;
+}
+#else
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+ void *item, unsigned order, bool replace)
+{
+ if (*slot)
+ return -EEXIST;
+ rcu_assign_pointer(*slot, item);
+ if (node) {
+ node->count++;
+ if (radix_tree_exceptional_entry(item))
+ node->exceptional++;
+ }
+ return 1;
}
+#endif
/**
* __radix_tree_insert - insert into a radix tree
@@ -642,13 +873,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
error = __radix_tree_create(root, index, order, &node, &slot);
if (error)
return error;
- if (*slot != NULL)
- return -EEXIST;
- rcu_assign_pointer(*slot, item);
+
+ error = insert_entries(node, slot, item, order, false);
+ if (error < 0)
+ return error;
if (node) {
unsigned offset = get_slot_offset(node, slot);
- node->count++;
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
BUG_ON(tag_get(node, 2, offset));
@@ -746,6 +977,287 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_lookup);
+static inline int slot_count(struct radix_tree_node *node,
+ void **slot)
+{
+ int n = 1;
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ void *ptr = node_to_entry(slot);
+ unsigned offset = get_slot_offset(node, slot);
+ int i;
+
+ for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[offset + i] != ptr)
+ break;
+ n++;
+ }
+#endif
+ return n;
+}
+
+static void replace_slot(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item,
+ bool warn_typeswitch)
+{
+ void *old = rcu_dereference_raw(*slot);
+ int count, exceptional;
+
+ WARN_ON_ONCE(radix_tree_is_internal_node(item));
+
+ count = !!item - !!old;
+ exceptional = !!radix_tree_exceptional_entry(item) -
+ !!radix_tree_exceptional_entry(old);
+
+ WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
+
+ if (node) {
+ node->count += count;
+ if (exceptional) {
+ exceptional *= slot_count(node, slot);
+ node->exceptional += exceptional;
+ }
+ }
+
+ rcu_assign_pointer(*slot, item);
+}
+
+static inline void delete_sibling_entries(struct radix_tree_node *node,
+ void **slot)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ bool exceptional = radix_tree_exceptional_entry(*slot);
+ void *ptr = node_to_entry(slot);
+ unsigned offset = get_slot_offset(node, slot);
+ int i;
+
+ for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[offset + i] != ptr)
+ break;
+ node->slots[offset + i] = NULL;
+ node->count--;
+ if (exceptional)
+ node->exceptional--;
+ }
+#endif
+}
+
+/**
+ * __radix_tree_replace - replace item in a slot
+ * @root: radix tree root
+ * @node: pointer to tree node
+ * @slot: pointer to slot in @node
+ * @item: new item to store in the slot.
+ * @update_node: callback for changing leaf nodes
+ * @private: private data to pass to @update_node
+ *
+ * For use with __radix_tree_lookup(). Caller must hold tree write locked
+ * across slot lookup and replacement.
+ */
+void __radix_tree_replace(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item,
+ radix_tree_update_node_t update_node, void *private)
+{
+ if (!item)
+ delete_sibling_entries(node, slot);
+ /*
+ * This function supports replacing exceptional entries and
+ * deleting entries, but that needs accounting against the
+ * node unless the slot is root->rnode.
+ */
+ replace_slot(root, node, slot, item,
+ !node && slot != (void **)&root->rnode);
+
+ if (!node)
+ return;
+
+ if (update_node)
+ update_node(node, private);
+
+ delete_node(root, node, update_node, private);
+}
+
+/**
+ * radix_tree_replace_slot - replace item in a slot
+ * @root: radix tree root
+ * @slot: pointer to slot
+ * @item: new item to store in the slot.
+ *
+ * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
+ * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked
+ * across slot lookup and replacement.
+ *
+ * NOTE: This cannot be used to switch between non-entries (empty slots),
+ * regular entries, and exceptional entries, as that requires accounting
+ * inside the radix tree node. When switching from one type of entry or
+ * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
+ * radix_tree_iter_replace().
+ */
+void radix_tree_replace_slot(struct radix_tree_root *root,
+ void **slot, void *item)
+{
+ replace_slot(root, NULL, slot, item, true);
+}
+
+/**
+ * radix_tree_iter_replace - replace item in a slot
+ * @root: radix tree root
+ * @slot: pointer to slot
+ * @item: new item to store in the slot.
+ *
+ * For use with radix_tree_split() and radix_tree_for_each_slot().
+ * Caller must hold tree write locked across split and replacement.
+ */
+void radix_tree_iter_replace(struct radix_tree_root *root,
+ const struct radix_tree_iter *iter, void **slot, void *item)
+{
+ __radix_tree_replace(root, iter->node, slot, item, NULL, NULL);
+}
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+/**
+ * radix_tree_join - replace multiple entries with one multiorder entry
+ * @root: radix tree root
+ * @index: an index inside the new entry
+ * @order: order of the new entry
+ * @item: new entry
+ *
+ * Call this function to replace several entries with one larger entry.
+ * The existing entries are presumed to not need freeing as a result of
+ * this call.
+ *
+ * The replacement entry will have all the tags set on it that were set
+ * on any of the entries it is replacing.
+ */
+int radix_tree_join(struct radix_tree_root *root, unsigned long index,
+ unsigned order, void *item)
+{
+ struct radix_tree_node *node;
+ void **slot;
+ int error;
+
+ BUG_ON(radix_tree_is_internal_node(item));
+
+ error = __radix_tree_create(root, index, order, &node, &slot);
+ if (!error)
+ error = insert_entries(node, slot, item, order, true);
+ if (error > 0)
+ error = 0;
+
+ return error;
+}
+
+/**
+ * radix_tree_split - Split an entry into smaller entries
+ * @root: radix tree root
+ * @index: An index within the large entry
+ * @order: Order of new entries
+ *
+ * Call this function as the first step in replacing a multiorder entry
+ * with several entries of lower order. After this function returns,
+ * loop over the relevant portion of the tree using radix_tree_for_each_slot()
+ * and call radix_tree_iter_replace() to set up each new entry.
+ *
+ * The tags from this entry are replicated to all the new entries.
+ *
+ * The radix tree should be locked against modification during the entire
+ * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which
+ * should prompt RCU walkers to restart the lookup from the root.
+ */
+int radix_tree_split(struct radix_tree_root *root, unsigned long index,
+ unsigned order)
+{
+ struct radix_tree_node *parent, *node, *child;
+ void **slot;
+ unsigned int offset, end;
+ unsigned n, tag, tags = 0;
+
+ if (!__radix_tree_lookup(root, index, &parent, &slot))
+ return -ENOENT;
+ if (!parent)
+ return -ENOENT;
+
+ offset = get_slot_offset(parent, slot);
+
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tag_get(parent, tag, offset))
+ tags |= 1 << tag;
+
+ for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
+ if (!is_sibling_entry(parent, parent->slots[end]))
+ break;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tags & (1 << tag))
+ tag_set(parent, tag, end);
+ /* rcu_assign_pointer ensures tags are set before RETRY */
+ rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY);
+ }
+ rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY);
+ parent->exceptional -= (end - offset);
+
+ if (order == parent->shift)
+ return 0;
+ if (order > parent->shift) {
+ while (offset < end)
+ offset += insert_entries(parent, &parent->slots[offset],
+ RADIX_TREE_RETRY, order, true);
+ return 0;
+ }
+
+ node = parent;
+
+ for (;;) {
+ if (node->shift > order) {
+ child = radix_tree_node_alloc(root, node,
+ node->shift - RADIX_TREE_MAP_SHIFT,
+ offset, 0, 0);
+ if (!child)
+ goto nomem;
+ if (node != parent) {
+ node->count++;
+ node->slots[offset] = node_to_entry(child);
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tags & (1 << tag))
+ tag_set(node, tag, offset);
+ }
+
+ node = child;
+ offset = 0;
+ continue;
+ }
+
+ n = insert_entries(node, &node->slots[offset],
+ RADIX_TREE_RETRY, order, false);
+ BUG_ON(n > RADIX_TREE_MAP_SIZE);
+
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tags & (1 << tag))
+ tag_set(node, tag, offset);
+ offset += n;
+
+ while (offset == RADIX_TREE_MAP_SIZE) {
+ if (node == parent)
+ break;
+ offset = node->offset;
+ child = node;
+ node = node->parent;
+ rcu_assign_pointer(node->slots[offset],
+ node_to_entry(child));
+ offset++;
+ }
+ if ((node == parent) && (offset == end))
+ return 0;
+ }
+
+ nomem:
+ /* Shouldn't happen; did user forget to preload? */
+ /* TODO: free all the allocated nodes */
+ WARN_ON(1);
+ return -ENOMEM;
+}
+#endif
+
/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
@@ -807,6 +1319,34 @@ static void node_tag_clear(struct radix_tree_root *root,
root_tag_clear(root, tag);
}
+static void node_tag_set(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ unsigned int tag, unsigned int offset)
+{
+ while (node) {
+ if (tag_get(node, tag, offset))
+ return;
+ tag_set(node, tag, offset);
+ offset = node->offset;
+ node = node->parent;
+ }
+
+ if (!root_tag_get(root, tag))
+ root_tag_set(root, tag);
+}
+
+/**
+ * radix_tree_iter_tag_set - set a tag on the current iterator entry
+ * @root: radix tree root
+ * @iter: iterator state
+ * @tag: tag to set
+ */
+void radix_tree_iter_tag_set(struct radix_tree_root *root,
+ const struct radix_tree_iter *iter, unsigned int tag)
+{
+ node_tag_set(root, iter->node, tag, iter_offset(iter));
+}
+
/**
* radix_tree_tag_clear - clear a tag on a radix tree node
* @root: radix tree root
@@ -902,6 +1442,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
#endif
}
+/* Construct iter->tags bit-mask from node->tags[tag] array */
+static void set_iter_tags(struct radix_tree_iter *iter,
+ struct radix_tree_node *node, unsigned offset,
+ unsigned tag)
+{
+ unsigned tag_long = offset / BITS_PER_LONG;
+ unsigned tag_bit = offset % BITS_PER_LONG;
+
+ iter->tags = node->tags[tag][tag_long] >> tag_bit;
+
+ /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
+ if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
+ /* Pick tags from next element */
+ if (tag_bit)
+ iter->tags |= node->tags[tag][tag_long + 1] <<
+ (BITS_PER_LONG - tag_bit);
+ /* Clip chunk size, here only BITS_PER_LONG tags */
+ iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
+ }
+}
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+static void **skip_siblings(struct radix_tree_node **nodep,
+ void **slot, struct radix_tree_iter *iter)
+{
+ void *sib = node_to_entry(slot - 1);
+
+ while (iter->index < iter->next_index) {
+ *nodep = rcu_dereference_raw(*slot);
+ if (*nodep && *nodep != sib)
+ return slot;
+ slot++;
+ iter->index = __radix_tree_iter_add(iter, 1);
+ iter->tags >>= 1;
+ }
+
+ *nodep = NULL;
+ return NULL;
+}
+
+void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
+ unsigned flags)
+{
+ unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
+ struct radix_tree_node *node = rcu_dereference_raw(*slot);
+
+ slot = skip_siblings(&node, slot, iter);
+
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
+ unsigned long next_index;
+
+ if (node == RADIX_TREE_RETRY)
+ return slot;
+ node = entry_to_node(node);
+ iter->node = node;
+ iter->shift = node->shift;
+
+ if (flags & RADIX_TREE_ITER_TAGGED) {
+ offset = radix_tree_find_next_bit(node, tag, 0);
+ if (offset == RADIX_TREE_MAP_SIZE)
+ return NULL;
+ slot = &node->slots[offset];
+ iter->index = __radix_tree_iter_add(iter, offset);
+ set_iter_tags(iter, node, offset, tag);
+ node = rcu_dereference_raw(*slot);
+ } else {
+ offset = 0;
+ slot = &node->slots[0];
+ for (;;) {
+ node = rcu_dereference_raw(*slot);
+ if (node)
+ break;
+ slot++;
+ offset++;
+ if (offset == RADIX_TREE_MAP_SIZE)
+ return NULL;
+ }
+ iter->index = __radix_tree_iter_add(iter, offset);
+ }
+ if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
+ goto none;
+ next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
+ if (next_index < iter->next_index)
+ iter->next_index = next_index;
+ }
+
+ return slot;
+ none:
+ iter->next_index = 0;
+ return NULL;
+}
+EXPORT_SYMBOL(__radix_tree_next_slot);
+#else
+static void **skip_siblings(struct radix_tree_node **nodep,
+ void **slot, struct radix_tree_iter *iter)
+{
+ return slot;
+}
+#endif
+
+void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
+{
+ struct radix_tree_node *node;
+
+ slot++;
+ iter->index = __radix_tree_iter_add(iter, 1);
+ node = rcu_dereference_raw(*slot);
+ skip_siblings(&node, slot, iter);
+ iter->next_index = iter->index;
+ iter->tags = 0;
+ return NULL;
+}
+EXPORT_SYMBOL(radix_tree_iter_resume);
+
/**
* radix_tree_next_chunk - find next chunk of slots for iteration
*
@@ -927,7 +1582,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
* because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
*
* This condition also used by radix_tree_next_slot() to stop
- * contiguous iterating, and forbid swithing to the next chunk.
+ * contiguous iterating, and forbid switching to the next chunk.
*/
index = iter->next_index;
if (!index && iter->index)
@@ -945,6 +1600,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
iter->index = index;
iter->next_index = maxindex + 1;
iter->tags = 1;
+ iter->node = NULL;
__set_iter_shift(iter, 0);
return (void **)&root->rnode;
}
@@ -960,9 +1616,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
return NULL;
if (flags & RADIX_TREE_ITER_TAGGED)
- offset = radix_tree_find_next_bit(
- node->tags[tag],
- RADIX_TREE_MAP_SIZE,
+ offset = radix_tree_find_next_bit(node, tag,
offset + 1);
else
while (++offset < RADIX_TREE_MAP_SIZE) {
@@ -982,154 +1636,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
child = rcu_dereference_raw(node->slots[offset]);
}
- if ((child == NULL) || (child == RADIX_TREE_RETRY))
+ if (!child)
goto restart;
+ if (child == RADIX_TREE_RETRY)
+ break;
} while (radix_tree_is_internal_node(child));
/* Update the iterator state */
iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
iter->next_index = (index | node_maxindex(node)) + 1;
+ iter->node = node;
__set_iter_shift(iter, node->shift);
- /* Construct iter->tags bit-mask from node->tags[tag] array */
- if (flags & RADIX_TREE_ITER_TAGGED) {
- unsigned tag_long, tag_bit;
-
- tag_long = offset / BITS_PER_LONG;
- tag_bit = offset % BITS_PER_LONG;
- iter->tags = node->tags[tag][tag_long] >> tag_bit;
- /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
- if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
- /* Pick tags from next element */
- if (tag_bit)
- iter->tags |= node->tags[tag][tag_long + 1] <<
- (BITS_PER_LONG - tag_bit);
- /* Clip chunk size, here only BITS_PER_LONG tags */
- iter->next_index = index + BITS_PER_LONG;
- }
- }
+ if (flags & RADIX_TREE_ITER_TAGGED)
+ set_iter_tags(iter, node, offset, tag);
return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);
/**
- * radix_tree_range_tag_if_tagged - for each item in given range set given
- * tag if item has another tag set
- * @root: radix tree root
- * @first_indexp: pointer to a starting index of a range to scan
- * @last_index: last index of a range to scan
- * @nr_to_tag: maximum number items to tag
- * @iftag: tag index to test
- * @settag: tag index to set if tested tag is set
- *
- * This function scans range of radix tree from first_index to last_index
- * (inclusive). For each item in the range if iftag is set, the function sets
- * also settag. The function stops either after tagging nr_to_tag items or
- * after reaching last_index.
- *
- * The tags must be set from the leaf level only and propagated back up the
- * path to the root. We must do this so that we resolve the full path before
- * setting any tags on intermediate nodes. If we set tags as we descend, then
- * we can get to the leaf node and find that the index that has the iftag
- * set is outside the range we are scanning. This reults in dangling tags and
- * can lead to problems with later tag operations (e.g. livelocks on lookups).
- *
- * The function returns the number of leaves where the tag was set and sets
- * *first_indexp to the first unscanned index.
- * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
- * be prepared to handle that.
- */
-unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
- unsigned long *first_indexp, unsigned long last_index,
- unsigned long nr_to_tag,
- unsigned int iftag, unsigned int settag)
-{
- struct radix_tree_node *parent, *node, *child;
- unsigned long maxindex;
- unsigned long tagged = 0;
- unsigned long index = *first_indexp;
-
- radix_tree_load_root(root, &child, &maxindex);
- last_index = min(last_index, maxindex);
- if (index > last_index)
- return 0;
- if (!nr_to_tag)
- return 0;
- if (!root_tag_get(root, iftag)) {
- *first_indexp = last_index + 1;
- return 0;
- }
- if (!radix_tree_is_internal_node(child)) {
- *first_indexp = last_index + 1;
- root_tag_set(root, settag);
- return 1;
- }
-
- node = entry_to_node(child);
-
- for (;;) {
- unsigned offset = radix_tree_descend(node, &child, index);
- if (!child)
- goto next;
- if (!tag_get(node, iftag, offset))
- goto next;
- /* Sibling slots never have tags set on them */
- if (radix_tree_is_internal_node(child)) {
- node = entry_to_node(child);
- continue;
- }
-
- /* tag the leaf */
- tagged++;
- tag_set(node, settag, offset);
-
- /* walk back up the path tagging interior nodes */
- parent = node;
- for (;;) {
- offset = parent->offset;
- parent = parent->parent;
- if (!parent)
- break;
- /* stop if we find a node with the tag already set */
- if (tag_get(parent, settag, offset))
- break;
- tag_set(parent, settag, offset);
- }
- next:
- /* Go to next entry in node */
- index = ((index >> node->shift) + 1) << node->shift;
- /* Overflow can happen when last_index is ~0UL... */
- if (index > last_index || !index)
- break;
- offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
- while (offset == 0) {
- /*
- * We've fully scanned this node. Go up. Because
- * last_index is guaranteed to be in the tree, what
- * we do below cannot wander astray.
- */
- node = node->parent;
- offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
- }
- if (is_sibling_entry(node, node->slots[offset]))
- goto next;
- if (tagged >= nr_to_tag)
- break;
- }
- /*
- * We need not to tag the root tag if there is no tag which is set with
- * settag within the range from *first_indexp to last_index.
- */
- if (tagged > 0)
- root_tag_set(root, settag);
- *first_indexp = index;
-
- return tagged;
-}
-EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
-
-/**
* radix_tree_gang_lookup - perform multiple lookup on a radix tree
* @root: radix tree root
* @results: where the results of the lookup are placed
@@ -1294,174 +1820,6 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
-#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
-#include <linux/sched.h> /* for cond_resched() */
-
-struct locate_info {
- unsigned long found_index;
- bool stop;
-};
-
-/*
- * This linear search is at present only useful to shmem_unuse_inode().
- */
-static unsigned long __locate(struct radix_tree_node *slot, void *item,
- unsigned long index, struct locate_info *info)
-{
- unsigned long i;
-
- do {
- unsigned int shift = slot->shift;
-
- for (i = (index >> shift) & RADIX_TREE_MAP_MASK;
- i < RADIX_TREE_MAP_SIZE;
- i++, index += (1UL << shift)) {
- struct radix_tree_node *node =
- rcu_dereference_raw(slot->slots[i]);
- if (node == RADIX_TREE_RETRY)
- goto out;
- if (!radix_tree_is_internal_node(node)) {
- if (node == item) {
- info->found_index = index;
- info->stop = true;
- goto out;
- }
- continue;
- }
- node = entry_to_node(node);
- if (is_sibling_entry(slot, node))
- continue;
- slot = node;
- break;
- }
- } while (i < RADIX_TREE_MAP_SIZE);
-
-out:
- if ((index == 0) && (i == RADIX_TREE_MAP_SIZE))
- info->stop = true;
- return index;
-}
-
-/**
- * radix_tree_locate_item - search through radix tree for item
- * @root: radix tree root
- * @item: item to be found
- *
- * Returns index where item was found, or -1 if not found.
- * Caller must hold no lock (since this time-consuming function needs
- * to be preemptible), and must check afterwards if item is still there.
- */
-unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
-{
- struct radix_tree_node *node;
- unsigned long max_index;
- unsigned long cur_index = 0;
- struct locate_info info = {
- .found_index = -1,
- .stop = false,
- };
-
- do {
- rcu_read_lock();
- node = rcu_dereference_raw(root->rnode);
- if (!radix_tree_is_internal_node(node)) {
- rcu_read_unlock();
- if (node == item)
- info.found_index = 0;
- break;
- }
-
- node = entry_to_node(node);
-
- max_index = node_maxindex(node);
- if (cur_index > max_index) {
- rcu_read_unlock();
- break;
- }
-
- cur_index = __locate(node, item, cur_index, &info);
- rcu_read_unlock();
- cond_resched();
- } while (!info.stop && cur_index <= max_index);
-
- return info.found_index;
-}
-#else
-unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
-{
- return -1;
-}
-#endif /* CONFIG_SHMEM && CONFIG_SWAP */
-
-/**
- * radix_tree_shrink - shrink radix tree to minimum height
- * @root radix tree root
- */
-static inline bool radix_tree_shrink(struct radix_tree_root *root)
-{
- bool shrunk = false;
-
- for (;;) {
- struct radix_tree_node *node = root->rnode;
- struct radix_tree_node *child;
-
- if (!radix_tree_is_internal_node(node))
- break;
- node = entry_to_node(node);
-
- /*
- * The candidate node has more than one child, or its child
- * is not at the leftmost slot, or the child is a multiorder
- * entry, we cannot shrink.
- */
- if (node->count != 1)
- break;
- child = node->slots[0];
- if (!child)
- break;
- if (!radix_tree_is_internal_node(child) && node->shift)
- break;
-
- if (radix_tree_is_internal_node(child))
- entry_to_node(child)->parent = NULL;
-
- /*
- * We don't need rcu_assign_pointer(), since we are simply
- * moving the node from one part of the tree to another: if it
- * was safe to dereference the old pointer to it
- * (node->slots[0]), it will be safe to dereference the new
- * one (root->rnode) as far as dependent read barriers go.
- */
- root->rnode = child;
-
- /*
- * We have a dilemma here. The node's slot[0] must not be
- * NULLed in case there are concurrent lookups expecting to
- * find the item. However if this was a bottom-level node,
- * then it may be subject to the slot pointer being visible
- * to callers dereferencing it. If item corresponding to
- * slot[0] is subsequently deleted, these callers would expect
- * their slot to become empty sooner or later.
- *
- * For example, lockless pagecache will look up a slot, deref
- * the page pointer, and if the page has 0 refcount it means it
- * was concurrently deleted from pagecache so try the deref
- * again. Fortunately there is already a requirement for logic
- * to retry the entire slot lookup -- the indirect pointer
- * problem (replacing direct root node with an indirect pointer
- * also results in a stale slot). So tag the slot as indirect
- * to force callers to retry.
- */
- if (!radix_tree_is_internal_node(child))
- node->slots[0] = RADIX_TREE_RETRY;
-
- radix_tree_node_free(node);
- shrunk = true;
- }
-
- return shrunk;
-}
-
/**
* __radix_tree_delete_node - try to free node after clearing a slot
* @root: radix tree root
@@ -1470,53 +1828,11 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
* After clearing the slot at @index in @node from radix tree
* rooted at @root, call this function to attempt freeing the
* node and shrinking the tree.
- *
- * Returns %true if @node was freed, %false otherwise.
*/
-bool __radix_tree_delete_node(struct radix_tree_root *root,
+void __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
- bool deleted = false;
-
- do {
- struct radix_tree_node *parent;
-
- if (node->count) {
- if (node == entry_to_node(root->rnode))
- deleted |= radix_tree_shrink(root);
- return deleted;
- }
-
- parent = node->parent;
- if (parent) {
- parent->slots[node->offset] = NULL;
- parent->count--;
- } else {
- root_tag_clear_all(root);
- root->rnode = NULL;
- }
-
- radix_tree_node_free(node);
- deleted = true;
-
- node = parent;
- } while (node);
-
- return deleted;
-}
-
-static inline void delete_sibling_entries(struct radix_tree_node *node,
- void *ptr, unsigned offset)
-{
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
- int i;
- for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
- if (node->slots[offset + i] != ptr)
- break;
- node->slots[offset + i] = NULL;
- node->count--;
- }
-#endif
+ delete_node(root, node, NULL, NULL);
}
/**
@@ -1558,11 +1874,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
node_tag_clear(root, node, tag, offset);
- delete_sibling_entries(node, node_to_entry(slot), offset);
- node->slots[offset] = NULL;
- node->count--;
-
- __radix_tree_delete_node(root, node);
+ __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
return entry;
}
@@ -1642,32 +1954,31 @@ static __init void radix_tree_init_maxnodes(void)
}
}
-static int radix_tree_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int radix_tree_cpu_dead(unsigned int cpu)
{
- int cpu = (long)hcpu;
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
/* Free per-cpu pool of preloaded nodes */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- rtp = &per_cpu(radix_tree_preloads, cpu);
- while (rtp->nr) {
- node = rtp->nodes;
- rtp->nodes = node->private_data;
- kmem_cache_free(radix_tree_node_cachep, node);
- rtp->nr--;
- }
+ rtp = &per_cpu(radix_tree_preloads, cpu);
+ while (rtp->nr) {
+ node = rtp->nodes;
+ rtp->nodes = node->private_data;
+ kmem_cache_free(radix_tree_node_cachep, node);
+ rtp->nr--;
}
- return NOTIFY_OK;
+ return 0;
}
void __init radix_tree_init(void)
{
+ int ret;
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
radix_tree_node_ctor);
radix_tree_init_maxnodes();
- hotcpu_notifier(radix_tree_callback, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
+ NULL, radix_tree_cpu_dead);
+ WARN_ON(ret < 0);
}
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c
index 76734004358d..20bca3d44f67 100644
--- a/lib/raid6/avx2.c
+++ b/lib/raid6/avx2.c
@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
kernel_fpu_end();
}
+static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 32) {
+ asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+ asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ }
+ asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+ /* Don't use movntdq for r/w memory area < cache line */
+ asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
const struct raid6_calls raid6_avx2x1 = {
raid6_avx21_gen_syndrome,
- NULL, /* XOR not yet implemented */
+ raid6_avx21_xor_syndrome,
raid6_have_avx2,
"avx2x1",
1 /* Has cache hints */
@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
kernel_fpu_end();
}
+static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 64) {
+ asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+ asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+ asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+ asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+ asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+ asm volatile("vmovdqa %0,%%ymm7"
+ :: "m" (dptr[z][d+32]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+ asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ }
+ asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+ asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+ /* Don't use movntdq for r/w memory area < cache line */
+ asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
+ asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+ asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
const struct raid6_calls raid6_avx2x2 = {
raid6_avx22_gen_syndrome,
- NULL, /* XOR not yet implemented */
+ raid6_avx22_xor_syndrome,
raid6_have_avx2,
"avx2x2",
1 /* Has cache hints */
@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
kernel_fpu_end();
}
+static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 128) {
+ asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+ asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+ asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
+ asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+ asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+ asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
+ asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
+ asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+ asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm12,%ymm10,%ymm10");
+ asm volatile("vpxor %ymm14,%ymm11,%ymm11");
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+ asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+ asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpand %ymm0,%ymm13,%ymm13");
+ asm volatile("vpand %ymm0,%ymm15,%ymm15");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+ asm volatile("vmovdqa %0,%%ymm7"
+ :: "m" (dptr[z][d+32]));
+ asm volatile("vmovdqa %0,%%ymm13"
+ :: "m" (dptr[z][d+64]));
+ asm volatile("vmovdqa %0,%%ymm15"
+ :: "m" (dptr[z][d+96]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+ asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm13,%ymm10,%ymm10");
+ asm volatile("vpxor %ymm15,%ymm11,%ymm11");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ }
+ asm volatile("prefetchnta %0" :: "m" (q[d]));
+ asm volatile("prefetchnta %0" :: "m" (q[d+64]));
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+ asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+ asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpand %ymm0,%ymm13,%ymm13");
+ asm volatile("vpand %ymm0,%ymm15,%ymm15");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ }
+ asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
+ asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
+ asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
+ asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
+ asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+ asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+ asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
+ asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
+ asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
+ asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
+ asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
+ }
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
const struct raid6_calls raid6_avx2x4 = {
raid6_avx24_gen_syndrome,
- NULL, /* XOR not yet implemented */
+ raid6_avx24_xor_syndrome,
raid6_have_avx2,
"avx2x4",
1 /* Has cache hints */
diff --git a/lib/rbtree.c b/lib/rbtree.c
index eb8a19fee110..1f8b112a7c35 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
*
* (p) (p)
* / \ / \
- * N S --> N Sl
+ * N S --> N sl
* / \ \
- * sl Sr s
+ * sl Sr S
* \
* Sr
+ *
+ * Note: p might be red, and then both
+ * p and sl are red after rotation(which
+ * breaks property 4). This is fixed in
+ * Case 4 (in __rb_rotate_set_parents()
+ * which set sl the color of p
+ * and set p RB_BLACK)
+ *
+ * (p) (sl)
+ * / \ / \
+ * N sl --> P S
+ * \ / \
+ * S N Sr
+ * \
+ * Sr
*/
tmp1 = tmp2->rb_right;
WRITE_ONCE(sibling->rb_left, tmp1);
@@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
}
break;
}
- /* Case 3 - right rotate at sibling */
+ /* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
WRITE_ONCE(sibling->rb_right, tmp1);
WRITE_ONCE(tmp2->rb_left, sibling);
@@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
tmp1 = sibling;
sibling = tmp2;
}
- /* Case 4 - left rotate at parent + color flips */
+ /* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 22e13a0e19d7..cb1b54ee8527 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -425,7 +425,8 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
dma_addr_t tbl_dma_addr,
phys_addr_t orig_addr, size_t size,
- enum dma_data_direction dir)
+ enum dma_data_direction dir,
+ unsigned long attrs)
{
unsigned long flags;
phys_addr_t tlb_addr;
@@ -526,7 +527,8 @@ found:
*/
for (i = 0; i < nslots; i++)
io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE);
return tlb_addr;
@@ -539,18 +541,20 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
static phys_addr_t
map_single(struct device *hwdev, phys_addr_t phys, size_t size,
- enum dma_data_direction dir)
+ enum dma_data_direction dir, unsigned long attrs)
{
dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
- return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir);
+ return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
+ dir, attrs);
}
/*
* dma_addr is the kernel virtual address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
{
unsigned long flags;
int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -561,6 +565,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
* First, sync the memory before unmapping the entry
*/
if (orig_addr != INVALID_PHYS_ADDR &&
+ !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE);
@@ -654,7 +659,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
* GFP_DMA memory; fall back on map_single(), which
* will grab memory from the lowest available address range.
*/
- phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
+ phys_addr_t paddr = map_single(hwdev, 0, size,
+ DMA_FROM_DEVICE, 0);
if (paddr == SWIOTLB_MAP_ERROR)
goto err_warn;
@@ -667,9 +673,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
(unsigned long long)dma_mask,
(unsigned long long)dev_addr);
- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+ /*
+ * DMA_TO_DEVICE to avoid memcpy in unmap_single.
+ * The DMA_ATTR_SKIP_CPU_SYNC is optional.
+ */
swiotlb_tbl_unmap_single(hwdev, paddr,
- size, DMA_TO_DEVICE);
+ size, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC);
goto err_warn;
}
}
@@ -698,8 +708,12 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
if (!is_swiotlb_buffer(paddr))
free_pages((unsigned long)vaddr, get_order(size));
else
- /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */
- swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE);
+ /*
+ * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single.
+ * DMA_ATTR_SKIP_CPU_SYNC is optional.
+ */
+ swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC);
}
EXPORT_SYMBOL(swiotlb_free_coherent);
@@ -714,8 +728,8 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
* When the mapping is small enough return a static buffer to limit
* the damage, or panic when the transfer is too big.
*/
- printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at "
- "device %s\n", size, dev ? dev_name(dev) : "?");
+ dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n",
+ size);
if (size <= io_tlb_overflow || !do_panic)
return;
@@ -755,7 +769,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
/* Oh well, have to allocate and map a bounce buffer. */
- map = map_single(dev, phys, size, dir);
+ map = map_single(dev, phys, size, dir, attrs);
if (map == SWIOTLB_MAP_ERROR) {
swiotlb_full(dev, size, dir, 1);
return phys_to_dma(dev, io_tlb_overflow_buffer);
@@ -764,12 +778,13 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
dev_addr = phys_to_dma(dev, map);
/* Ensure that the address returned is DMA'ble */
- if (!dma_capable(dev, dev_addr, size)) {
- swiotlb_tbl_unmap_single(dev, map, size, dir);
- return phys_to_dma(dev, io_tlb_overflow_buffer);
- }
+ if (dma_capable(dev, dev_addr, size))
+ return dev_addr;
- return dev_addr;
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
+
+ return phys_to_dma(dev, io_tlb_overflow_buffer);
}
EXPORT_SYMBOL_GPL(swiotlb_map_page);
@@ -782,14 +797,15 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
* whatever the device wrote there.
*/
static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir)
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
{
phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
BUG_ON(dir == DMA_NONE);
if (is_swiotlb_buffer(paddr)) {
- swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
+ swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
return;
}
@@ -809,7 +825,7 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
- unmap_single(hwdev, dev_addr, size, dir);
+ unmap_single(hwdev, dev_addr, size, dir, attrs);
}
EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
@@ -891,11 +907,12 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
if (swiotlb_force ||
!dma_capable(hwdev, dev_addr, sg->length)) {
phys_addr_t map = map_single(hwdev, sg_phys(sg),
- sg->length, dir);
+ sg->length, dir, attrs);
if (map == SWIOTLB_MAP_ERROR) {
/* Don't panic here, we expect map_sg users
to do proper error handling. */
swiotlb_full(hwdev, sg->length, dir, 0);
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
attrs);
sg_dma_len(sgl) = 0;
@@ -910,14 +927,6 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
}
EXPORT_SYMBOL(swiotlb_map_sg_attrs);
-int
-swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir)
-{
- return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0);
-}
-EXPORT_SYMBOL(swiotlb_map_sg);
-
/*
* Unmap a set of streaming mode DMA translations. Again, cpu read rules
* concerning calls here are the same as for swiotlb_unmap_page() above.
@@ -933,19 +942,11 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
BUG_ON(dir == DMA_NONE);
for_each_sg(sgl, sg, nelems, i)
- unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir);
-
+ unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir,
+ attrs);
}
EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
-void
-swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir)
-{
- return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0);
-}
-EXPORT_SYMBOL(swiotlb_unmap_sg);
-
/*
* Make physical memory consistent for a set of streaming mode DMA translations
* after a transfer.
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 5e51872b3fc1..fbdf87920093 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -20,6 +20,11 @@
#include <linux/uaccess.h>
#include <linux/module.h>
+/*
+ * Note: test functions are marked noinline so that their names appear in
+ * reports.
+ */
+
static noinline void __init kmalloc_oob_right(void)
{
char *ptr;
@@ -411,6 +416,29 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
+static noinline void __init use_after_scope_test(void)
+{
+ volatile char *volatile p;
+
+ pr_info("use-after-scope on int\n");
+ {
+ int local = 0;
+
+ p = (char *)&local;
+ }
+ p[0] = 1;
+ p[3] = 1;
+
+ pr_info("use-after-scope on array\n");
+ {
+ char local[1024] = {0};
+
+ p = local;
+ }
+ p[0] = 1;
+ p[1023] = 1;
+}
+
static int __init kmalloc_tests_init(void)
{
kmalloc_oob_right();
@@ -436,6 +464,7 @@ static int __init kmalloc_tests_init(void)
kasan_global_oob();
ksize_unpoisons_memory();
copy_user_test();
+ use_after_scope_test();
return -EAGAIN;
}