From e52f4a8f3be1c6fdf6fb777eacbeec5fac5d6512 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Dec 2018 10:27:45 -0500 Subject: [PATCH] maple_tree: Change my thoughts Reorganise the Thoughts file to bring it to some kind of order, and change the tree dumping code accordingly. It only supports three kinds of node so far: leaf_64, range_64 and dense. But it does now support user pointers that are only 2-byte aligned. Signed-off-by: Matthew Wilcox --- Thoughts | 108 +++++++++++++-------- include/linux/maple_tree.h | 19 ++-- lib/maple_tree.c | 161 +++++++++++++++++++++---------- tools/testing/radix-tree/maple.c | 2 +- 4 files changed, 188 insertions(+), 102 deletions(-) diff --git a/Thoughts b/Thoughts index b11f691a8f7d..588b331ae5bf 100644 --- a/Thoughts +++ b/Thoughts @@ -1,54 +1,78 @@ +Tree format +=========== + The Maple Tree squeezes various bits in at various points which aren't necessarily obvious. Usually, this is done by observing that pointers are N-byte aligned and thus the bottom log_2(N) bits are available for use. We don't use the high bits of pointers to store additional information because we don't know what bits are unused on any given architecture. - -Nodes are 128 bytes in size and are also aligned to 128 bytes, giving us -7 bits for our own purposes in each entry in an internal node. We need -to store the type of the node pointed to (enum maple_type, four bits), -and whether there are any unallocated slots anywhere below this node -(for implementing xa_alloc). That leaves two bits unused for now. - -The tree->ma_root slot pointer uses those two bits (if we end up needing -extra bits, we'll choose some functionality to drop from the root). -If the bottom two bits of the root entry have value '10', then the pointer -is a pointer to a node. Otherwise, the tree contains only a single entry -at index 0. If we attempt to store a single entry at index 0 which has -this pattern in its bottom two bits, we allocate a node and store the -entry in the node. - -state->alloc does not use the low 7 bits for storing the type of the -node pointed at (since it has no type yet). Instead it stores the -number of nodes which still need to be allocated. The first node allocated -is pointed to by state->alloc. Subsequent nodes allocated are pointed -to by state->alloc->slots[n]. - -state->node may also contain things which are not nodes. In this -case, bit 0 set indicates a tree location which is not a slot, bit -1 set indicates an error. Bit 2 (the 'unallocated slots' bit) is -never set. Bits 3-7 indicate the node type. This means that (for the -optimised iteration loop), the inline function can simply test (unsigned +Nodes are 128 bytes in size and are also aligned to 128 bytes, giving +us 7 bits for our own purposes. + +Tree Root +--------- + +If the tree contains a single entry at index 0, it is usually stored in +tree->ma_root. To optimise for the page cache, an entry which ends in +'00', '01' or '11' is stored in the root, but an entry which ends in +'10' will be stored in a node. Bit 2 is set if there are any NULL +entries in the tree. Bits 3-6 are used to store enum maple_type. + +Node Slots +---------- + +Leaf nodes (dense, sparse_N and leaf_N) do not store pointers to nodes, +they store user data. Users may store almost any bit pattern. As noted +above, the optimisation of storing an entry at 0 in the root pointer +cannot be done for data which have the bottom two bits set to '10'. +We also reserve values with the bottom two bits set to '10' which are +below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs return +errnos as a negative errno shifted right by two bits and the bottom two +bits set to '10', and while choosing to store these values in the array +is not an error, it may lead to confusion if you're testing for an error +with xa_is_err(). + +Non-leaf nodes store the type of the node pointed to (enum maple_type +in bits 3-6), and whether there are any NULL entries anywhere below this +node in bit 2. That leaves bits 0-1 unused for now. + +Node Parent +----------- + +The node->parent of the root node has bit 0 set and the rest of the +pointer is a pointer to the tree itself. No more bits are available in +this pointer (on m68k, the data structure may only be 2-byte aligned). + +Non-root nodes can only have maple_range_* nodes as parents, so we only +need to distinguish between range_16, range_32 and range_64. With 32-bit +pointers, we can store up to 11 pointers in a range_64, 16 in a range_32 +and 21 in a range_16, so encode a range_16 as 00 (Bits 2-6 encode the +the slot number), encode range_32 as 010 (Bits 3-6 encode the slot +number) and range_64 as 110 (Bits 3-6 encode the slot number). + +Maple State +----------- + +If state->node has bit 0 set then it references a tree location which +is not a node (eg the root). If bit 1 is set, the rest of the bits +are a negative errno. Bit 2 (the 'unallocated slots' bit) is clear. +Bits 3-6 indicate the node type. This encoding is chosen so that in the +optimised iteration loop the inline function can simply test (unsigned long)state->node & 127 and bail to the out-of-line functions if it's -not 0. 'maple_dense' must remain as enum 0 for this to work. - -Leaf nodes do not store pointers to nodes, they store user data. -Users may store almost any bit pattern. As noted above, the optimisation -of storing an entry at 0 in the root pointer cannot be done for data -which have the bottom two bits set to '10'. We also reserve values -with the bottom two bits set to '10' which are below 4096 (ie 2, 6, -10 .. 4094) for internal use. Some APIs return errnos as a negative -errno shifted right by two bits and the bottom two bits set to '10', -and while choosing to store these values in the array is not an error, -it may lead to confusion if you're testing for an error with xa_is_err(). +not 0. The maple_dense enum must remain at 0 for this to work. -To summarise: Users may store any valid kernel pointer and any value -between 0 and LONG_MAX (encoded as an xa_value). Other values may or -may not work well. +state->alloc uses the low 7 bits for storing a number. If state->node +indicates -ENOMEM, the number is the number of nodes which need to +be allocated. If state->node is a pointer to a node, the number is the +offset in the slots array of the entry. The high bits of state->alloc are +either NULL or a pointer to the first node allocated. Subsequent nodes +allocated are pointed to by state->alloc->slots[n]. ------ +Worked examples +=============== -Inserting multiples of 5: +Inserting multiples of 5 +------------------------ Insert p0 at 0: Tree contains p0 at root. diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 4c135f49d08c..431d1f10757f 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -118,15 +118,18 @@ struct maple_node { enum maple_type { maple_dense, - maple_range_64, - maple_range_32, - maple_range_16, - maple_sparse_64, - maple_sparse_32, - maple_sparse_21, - maple_sparse_16, - maple_sparse_9, maple_sparse_6, + maple_sparse_9, + maple_sparse_16, + maple_sparse_21, + maple_sparse_32, + maple_sparse_64, + maple_leaf_16, + maple_leaf_32, + maple_leaf_64, + maple_range_16, + maple_range_32, + maple_range_64, }; struct maple_tree { diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 42d49595eb7f..d308f6b5708a 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -8,23 +8,37 @@ #include +static inline enum maple_type mt_node_type(const void *entry) +{ + return ((unsigned long)entry >> 3) & 15; +} + +static inline bool mt_is_leaf(const void *entry) +{ + return mt_node_type(entry) < maple_range_16; +} + +static inline bool ma_is_reserved(const void *entry) +{ + return ((unsigned long)entry < 4096) && xa_is_internal(entry); +} static inline bool mas_is_start(struct ma_state *ms) { return ms->node == MAS_START; } -static inline struct maple_node *ma_to_node(const void *entry) +static inline struct maple_node *mt_to_node(const void *entry) { BUG_ON(!xa_is_node(entry)); return (struct maple_node *)((unsigned long)entry & ~127); } static inline -void *ma_mk_node(const struct maple_node *node, enum maple_type type) +void *mt_mk_node(const struct maple_node *node, enum maple_type type) { BUG_ON(xa_is_node(node)); - return (void *)((unsigned long)node | (type << 2) | 2); + return (void *)((unsigned long)node | (type << 3) | 2); } void mtree_init(struct maple_tree *mt) { @@ -50,76 +64,121 @@ void mtree_destroy(struct maple_tree *mt) { } #ifdef MT_DEBUG -void mn_dump(void *entry, unsigned long min, unsigned long max, - unsigned int depth) +void mt_dump_range(unsigned long min, unsigned long max) { if (min == max) pr_info("%lu: ", min); else pr_info("%lu-%lu: ", min, max); +} - if (xa_is_node(entry)) { - struct maple_range_64 *node = &ma_to_node(entry)->mr64; - unsigned long first = min; - unsigned int i; - - pr_cont("node %p depth %d parent %p contents: ", node, depth, - node->parent); - for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); - } - pr_cont("%p\n", node->slot[i]); - - for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { - unsigned long last = max; +void mt_dump_entry(void *entry, unsigned long min, unsigned long max) +{ + mt_dump_range(min, max); - if (i < (MAPLE_RANGE64_SLOTS - 1)) - last = node->pivot[i]; - if (last == 0 && i > 0) - break; - mn_dump(node->slot[i], first, last, depth + 1); - if (last > max) { - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); - break; - } - first = last + 1; - } - } else if (xa_is_value(entry)) + if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry), xa_to_value(entry), entry); - else if (!xa_is_internal(entry)) - pr_cont("%p\n", entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); - else + else if (ma_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (%p)\n", entry); + else + pr_cont("%p\n", entry); +} + +void mt_dump_node(void *entry, unsigned long min, unsigned long max, + unsigned int depth); + +void mt_dump_range64(void *entry, unsigned long min, unsigned long max, + unsigned int depth) +{ + struct maple_range_64 *node = &mt_to_node(entry)->mr64; + bool leaf = mt_is_leaf(entry); + unsigned long first = min; + int i; + + pr_cont(" contents: "); + for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + pr_cont("%p\n", node->slot[i]); + for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { + unsigned long last = max; + + if (i < (MAPLE_RANGE64_SLOTS - 1)) + last = node->pivot[i]; + if (last == 0 && i > 0) + break; + + if (leaf) + mt_dump_entry(node->slot[i], first, last); + else + mt_dump_node(node->slot[i], first, last, depth + 1); + + if (last == max) + break; + if (last > max) { + pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); + break; + } + first = last + 1; + } +} + +void mt_dump_node(void *entry, unsigned long min, unsigned long max, + unsigned int depth) +{ + struct maple_node *node = mt_to_node(entry); + unsigned int type = mt_node_type(entry); + unsigned int i; + + mt_dump_range(min, max); + + pr_cont("node %p depth %d type %d parent %p", node, depth, type, + node->parent); + switch (type) { + case maple_dense: + pr_cont("\n"); + for (i = 0; i < MAPLE_NODE_SLOTS; i++) { + mt_dump_entry(node->slot[i], min + i, min + i); + if (min + i == max) + break; + } + break; + case maple_leaf_64: + case maple_range_64: + mt_dump_range64(entry, min, max, depth); + break; + default: + pr_cont(" UNKNOWN TYPE\n"); + } } unsigned long mt_max[] = { - MAPLE_NODE_SLOTS, - ULONG_MAX, - UINT_MAX, - (1UL << 16) - 1, - ULONG_MAX, - UINT_MAX, - (1UL << 21) - 1, - (1UL << 16) - 1, - (1UL << 9) - 1, - (1UL << 6) - 1, + [maple_dense] = MAPLE_NODE_SLOTS, + [maple_sparse_6] = (1UL << 6) - 1, + [maple_sparse_9] = (1UL << 9) - 1, + [maple_sparse_16] = (1UL << 16) - 1, + [maple_sparse_21] = (1UL << 21) - 1, + [maple_sparse_32] = UINT_MAX, + [maple_sparse_64] = ULONG_MAX, + [maple_leaf_16] = (1UL << 16) - 1, + [maple_leaf_32] = UINT_MAX, + [maple_leaf_64] = ULONG_MAX, + [maple_range_16] = (1UL << 16) - 1, + [maple_range_32] = UINT_MAX, + [maple_range_64] = ULONG_MAX, }; void mt_dump(const struct maple_tree *mt) { void *entry = mt->ma_root; - unsigned long max; + pr_info("maple_tree(%p) flags %X, root %p\n", + mt, mt->ma_flags, entry); if (!xa_is_node(entry)) - max = 0; + mt_dump_entry(entry, 0, 0); else - max = mt_max[xa_to_internal(entry) & 15]; - - pr_info("maple_tree(%p) flags %X, root %p, max %lu\n", - mt, mt->ma_flags, entry, max); - mn_dump(entry, 0, max, 0); + mt_dump_node(entry, 0, mt_max[mt_node_type(entry)], 0); } #endif diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 9838f144545f..7ec885eca802 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -33,7 +33,7 @@ void farmer_tests(void) node->mr64.pivot[0] = 0; node->mr64.pivot[1] = 1; node->mr64.pivot[2] = 0; - tree.ma_root = ma_mk_node(node, maple_range_64); + tree.ma_root = mt_mk_node(node, maple_leaf_64); mt_dump(&tree); } -- 2.50.1