1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "block_tree.h"
18 
19 #include <assert.h>
20 #include <inttypes.h>
21 #include <limits.h>
22 #include <lk/compiler.h>
23 #include <lk/macros.h>
24 #include <stdbool.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include "array.h"
31 #include "block_allocator.h"
32 #include "block_cache.h"
33 #include "crypt.h"
34 #include "debug.h"
35 #include "transaction.h"
36 
37 #if BUILD_STORAGE_TEST
38 #include <math.h>
39 #endif
40 
41 bool print_lookup;
42 static bool print_ops;
43 static bool print_node_changes;
44 static bool print_internal_changes;
45 static bool print_changes;
46 static bool print_changes_full_tree;
47 static bool print_check_errors;
48 
49 /**
50  * struct block_tree_node - On-disk B+ tree node header and payload start
51  * @iv:         initial value used for encrypt/decrypt
52  * @is_leaf:    0 for internal nodes, 1 for leaf nodes.
53  * @data:       Key, child and data payload, size of entries is tree specific
54  *              so accessor functions are needed to get to specific entries.
55  *
56  * On-disk B+ tree node format.
57  *
58  * If %is_leaf is 0, the node is an internal B+ tree node and @data contains n
59  * keys and n + 1 child block-macs. If %is_leaf is 1, the node is a leaf node
60  * and @data contains n keys and n data entries. For either node type
61  * n <= block_tree_max_key_count. For root leaf nodes n >= 0, for root internal
62  * nodes n >= 1, otherwise n >= block_tree_min_key_count. While updating the
63  * tree, n can temporarily exceed these limits by 1 for a single node. There is
64  * no room in this struct to store extra keys, so the extra key and child/data
65  * is stored in the in-memory tree struct.
66  *
67  * The value of block_tree_max_key_count and block_tree_min_key_count depends
68  * on the block, key, child and data sizes for the tree, and may not be the
69  * same for leaf and internal nodes.
70  *
71  * Keys are stored starting at @data, child and data values are stored starting
72  * at data + block_tree_max_key_count * key-size.
73  *
74  * TODO: store n in this struct? Currently a 0 key value us used to determine
75  * how full a node is, which prevents storing 0 keys in the tree.
76  */
77 struct block_tree_node {
78     struct iv iv;
79     uint64_t is_leaf;
80     uint8_t data[0];
81 };
82 
83 /**
84  * block_tree_node_is_leaf - Check if node is a leaf or an internal node
85  * @node_ro:    Node pointer.
86  *
87  * Return: %true if @node_ro is a leaf node, %false if @node_ro is an internal
88  * node.
89  */
block_tree_node_is_leaf(const struct block_tree_node * node_ro)90 static bool block_tree_node_is_leaf(const struct block_tree_node* node_ro) {
91     assert(node_ro);
92     assert(node_ro->is_leaf <= 1);
93 
94     return node_ro->is_leaf;
95 }
96 
97 /**
98  * block_tree_set_sizes - Configure tree block and entry sizes
99  * @tree:           Tree object.
100  * @block_size:     Block size.
101  * @key_size:       Key size. [1-8].
102  * @child_size:     Child size. [@key_size-24].
103  * @data_size:      Data size. [1-24].
104  *
105  * Store tree block and entry sizes and calculate key counts.
106  */
block_tree_set_sizes(struct block_tree * tree,size_t block_size,size_t key_size,size_t child_size,size_t data_size)107 static void block_tree_set_sizes(struct block_tree* tree,
108                                  size_t block_size,
109                                  size_t key_size,
110                                  size_t child_size,
111                                  size_t data_size) {
112     size_t payload_size = block_size - sizeof(struct block_tree_node);
113 
114     assert(payload_size < block_size);
115     assert(key_size);
116     assert(key_size <= sizeof(data_block_t));
117     assert(child_size >= key_size);
118     assert(child_size <= sizeof(struct block_mac));
119     assert(data_size);
120     assert(data_size <= sizeof(struct block_mac));
121 
122     tree->key_size = key_size;
123     tree->child_data_size[0] = child_size;
124     tree->child_data_size[1] = data_size;
125     tree->key_count[0] = (payload_size - child_size) / (key_size + child_size);
126     /* TODO: allow next pointer when mac is not needed? */
127     tree->key_count[1] = (payload_size) / (key_size + data_size);
128     tree->block_size = block_size;
129 
130     assert(tree->key_count[0] >= 2);
131     assert(tree->key_count[1] >= 2);
132 }
133 
134 /**
135  * is_zero - Helper function to check that buffer only contain 0 bytes
136  * @data:       Data pointer.
137  * @size:       Number of bytes to check.
138  *
139  * Return: %true if @size is 0 or @data[0..@size - 1] is all 0, %false
140  * otherwise.
141  */
is_zero(const void * data,size_t size)142 static bool is_zero(const void* data, size_t size) {
143     if (!size) {
144         return true;
145     }
146     assert(size >= 1);
147     return !*(char*)data && !memcmp(data, data + 1, size - 1);
148 }
149 
150 /**
151  * block_tree_max_key_count - Return max key count for leaf or internal nodes
152  * @tree:       Tree object.
153  * @leaf:       %true for leaf node, %false for internal node.
154  *
155  * Return: Maximum number of keys that fit in a leaf node or in and internal
156  * node.
157  */
block_tree_max_key_count(const struct block_tree * tree,bool leaf)158 static unsigned int block_tree_max_key_count(const struct block_tree* tree,
159                                              bool leaf) {
160     unsigned int key_count = tree->key_count[leaf];
161     assert(key_count);
162     assert(key_count * tree->key_size < tree->block_size);
163 
164     return key_count;
165 }
166 
167 /**
168  * block_tree_node_max_key_count - Return max key count for specific node
169  * @tree:       Tree object.
170  * @node_ro:    Node pointer.
171  *
172  * Return: Maximum number of keys that fit @node_ro.
173  */
block_tree_node_max_key_count(const struct block_tree * tree,const struct block_tree_node * node_ro)174 static unsigned int block_tree_node_max_key_count(
175         const struct block_tree* tree,
176         const struct block_tree_node* node_ro) {
177     return block_tree_max_key_count(tree, block_tree_node_is_leaf(node_ro));
178 }
179 
180 /**
181  * Child selection for block_tree_node_shift
182  * @SHIFT_LEAF_OR_LEFT_CHILD:   Required value for leaf nodes, selects left
183  *                              child for internal nodes.
184  * @SHIFT_RIGHT_CHILD:          Select right child.
185  * @SHIFT_LEFT_CHILD:           Select left child (for src_index).
186  * @SHIFT_BOTH:                 Select left child at start and right child at
187  *                              end. Only valid when inserting data at the end
188  *                              of a node.
189  */
190 #define SHIFT_LEAF_OR_LEFT_CHILD 0U
191 #define SHIFT_RIGHT_CHILD (1U << 0)
192 #define SHIFT_LEFT_CHILD (1U << 1)
193 #define SHIFT_BOTH (SHIFT_RIGHT_CHILD | SHIFT_LEFT_CHILD)
194 
195 /**
196  * block_tree_node_shift - Helper function to insert or remove entries in a node
197  * @tree:           Tree object.
198  * @node_rw:        Node pointer.
199  * @dest_index:     Destination index for existing entries to shift.
200  * @src_index:      Source index for existing entries to shift.
201  * @shift_mode:     Specifies which child to shift for internal nodes.
202  * @new_key:        When shifting up (inserting), keys to insert.
203  * @new_data:       When shifting up (inserting), data (or child) to insert.
204  * @overflow_key:   When shifting up (inserting), location to store keys that
205  *                  was shifted out of range. Can be %NULL if all those keys are
206  *                  0.
207  * @overflow_data:  When shifting up (inserting), location to store data (or
208  *                  child) that was shifted out of range. Can be %NULL if that
209  *                  data is all 0.
210  */
block_tree_node_shift(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int dest_index,unsigned int src_index,uint32_t shift_mode,const void * new_key,const void * new_data,void * overflow_key,void * overflow_data)211 static void block_tree_node_shift(const struct block_tree* tree,
212                                   struct block_tree_node* node_rw,
213                                   unsigned int dest_index,
214                                   unsigned int src_index,
215                                   uint32_t shift_mode,
216                                   const void* new_key,
217                                   const void* new_data,
218                                   void* overflow_key,
219                                   void* overflow_data) {
220     bool is_leaf = block_tree_node_is_leaf(node_rw);
221     unsigned int max_count = tree->key_count[is_leaf];
222 
223     int i;
224     void* base;
225     size_t entry_size;
226 
227     const void* src;
228     void* dest;
229     size_t size;
230     unsigned int clear_index;
231 
232     assert(max_count);
233     assert(dest_index <= max_count + !is_leaf + 1);
234 
235     for (i = 0; i < 2; i++) {
236         if (i == 0) {
237             /* key */
238             base = node_rw->data;
239             entry_size = tree->key_size;
240         } else {
241             /* data/child */
242             base = node_rw->data + tree->key_size * max_count;
243             entry_size = tree->child_data_size[is_leaf];
244             if (!is_leaf) {
245                 /* internal nodes have one more child than keys */
246                 max_count++;
247             }
248             if (shift_mode & SHIFT_RIGHT_CHILD) {
249                 assert(!is_leaf);
250                 if (!(shift_mode & SHIFT_LEFT_CHILD) && src_index != UINT_MAX) {
251                     src_index++;
252                 }
253                 dest_index++;
254             }
255         }
256 
257         if (src_index < dest_index) {
258             /* Inserting, copy entries that will be overwritten to overflow_* */
259             size = (dest_index - src_index) * entry_size;
260             if (src_index == max_count) {
261                 src = i == 0 ? new_key : new_data;
262             } else {
263                 src = base + max_count * entry_size - size;
264             }
265             dest = i == 0 ? overflow_key : overflow_data;
266             if (dest) {
267                 if (print_node_changes) {
268 #if TLOG_LVL >= TLOG_LVL_DEBUG
269                     printf("%s: copy %p, index %d/%d, to overflow, %p, size %zd, is_zero %d\n",
270                            __func__, src, max_count - (dest_index - src_index),
271                            max_count, dest, size, is_zero(src, size));
272 #else
273                     printf("%s: copy src index %d/%d, to overflow dest, size %zd, is_zero %d\n",
274                            __func__, max_count - (dest_index - src_index),
275                            max_count, size, is_zero(src, size));
276 #endif
277                 }
278                 memcpy(dest, src, size);
279             } else {
280                 assert(is_zero(src, size));
281             }
282         }
283 
284         if (src_index < max_count) {
285             /* Inserting or deleting, shift up or down */
286             src = base + src_index * entry_size;
287             dest = base + dest_index * entry_size;
288             size = (max_count - MAX(src_index, dest_index)) * entry_size;
289             if (print_node_changes) {
290 #if TLOG_LVL >= TLOG_LVL_DEBUG
291                 printf("%s: move %p, index %d, to %p, index %d, size %zd, is_zero %d\n",
292                        __func__, src, src_index, dest, dest_index, size,
293                        is_zero(src, size));
294 #else
295                 printf("%s: move src index %d, to dest index %d, size %zd, is_zero %d\n",
296                        __func__, src_index, dest_index, size,
297                        is_zero(src, size));
298 #endif
299             }
300             memmove(dest, src, size);
301             if (src_index >= dest_index) {
302                 clear_index = max_count + dest_index - src_index;
303             }
304         } else {
305             clear_index = dest_index;
306         }
307 
308         if (src_index < dest_index) {
309             /* Inserting, copy new entries passed in */
310             assert(new_key);
311             assert(new_data);
312             src = i == 0 ? new_key : new_data;
313             dest = base + src_index * entry_size;
314             size = (dest_index - src_index) * entry_size;
315             if (src_index == max_count) {
316                 /* NOP, data was copied to overflow_* above */
317             } else {
318                 assert(src);
319                 if (print_node_changes) {
320 #if TLOG_LVL >= TLOG_LVL_DEBUG
321                     printf("%s: copy new data %p, to %p, index %d, size %zd, is_zero %d\n",
322                            __func__, src, dest, src_index, size,
323                            is_zero(src, size));
324 #else
325                     printf("%s: copy new data, index %d, size %zd, is_zero %d\n",
326                            __func__, src_index, size,
327                            is_zero(src, size));
328 #endif
329                 }
330                 memcpy(dest, src, size);
331             }
332         } else {
333             /* Deleting, clear unused space at end */
334             assert(dest_index <= max_count);
335             dest = base + clear_index * entry_size;
336             size = (max_count - clear_index) * entry_size;
337             if (print_node_changes) {
338 #if TLOG_LVL >= TLOG_LVL_DEBUG
339                 printf("%s: clear %p, index %d/%d, size %zd\n", __func__, dest,
340                        clear_index, max_count, size);
341 #else
342                 printf("%s: clear dest, index %d/%d, size %zd\n", __func__,
343                        clear_index, max_count, size);
344 #endif
345             }
346             memset(dest, 0, size);
347         }
348     }
349 }
350 
351 /**
352  * block_tree_node_merge_entries - Helper function to merge nodes
353  * @tree:           Tree object.
354  * @node_rw:        Destination node.
355  * @src_node_ro:    Source node.
356  * @dest_index:     Index to insert at in @node_rw.
357  * @count:          Number of entries to copy from start of @src_node_ro.
358  * @merge_key:      For internal nodes, key to insert between nodes.
359  */
block_tree_node_merge_entries(const struct block_tree * tree,struct block_tree_node * node_rw,const struct block_tree_node * src_node_ro,unsigned int dest_index,unsigned int count,const void * merge_key)360 static void block_tree_node_merge_entries(
361         const struct block_tree* tree,
362         struct block_tree_node* node_rw,
363         const struct block_tree_node* src_node_ro,
364         unsigned int dest_index,
365         unsigned int count,
366         const void* merge_key) {
367     bool is_leaf = block_tree_node_is_leaf(node_rw);
368     unsigned int max_count = tree->key_count[is_leaf];
369     void* dest_key;
370     uint32_t shift_mode = SHIFT_LEAF_OR_LEFT_CHILD;
371     if (!is_leaf) {
372         dest_key = node_rw->data + tree->key_size * dest_index;
373         assert(is_zero(dest_key, tree->key_size));
374         memcpy(dest_key, merge_key, tree->key_size);
375         dest_index++;
376         shift_mode = SHIFT_BOTH;
377     }
378     block_tree_node_shift(tree, node_rw, dest_index + count, dest_index,
379                           shift_mode, src_node_ro->data,
380                           src_node_ro->data + tree->key_size * max_count, NULL,
381                           NULL);
382 }
383 
384 /**
385  * block_tree_node_shift_down - Helper function to delete entries from node
386  * @tree:           Tree object.
387  * @node_rw:        Node.
388  * @dest_index:     Destination index for existing entries to shift (or
389  *                  first entry to remove).
390  * @src_index:      Source index for existing entries to shift (or first entry
391  *                  after @dest_index not to remove).
392  * @shift_mode:     Specifies which child to shift for internal nodes.
393  */
block_tree_node_shift_down(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int dest_index,unsigned int src_index,uint32_t shift_mode)394 static void block_tree_node_shift_down(const struct block_tree* tree,
395                                        struct block_tree_node* node_rw,
396                                        unsigned int dest_index,
397                                        unsigned int src_index,
398                                        uint32_t shift_mode) {
399     assert(dest_index < src_index);
400     block_tree_node_shift(tree, node_rw, dest_index, src_index, shift_mode,
401                           NULL, NULL, NULL, NULL);
402 }
403 
404 /**
405  * block_tree_node_shift_down - Helper function to delete entries from end of
406  * node
407  * @tree:           Tree object.
408  * @node_rw:        Node.
409  * @start_index:    Index of first entry to remove. For internal nodes the
410  *                  the right child is removed, so @start_index refers to the
411  *                  key index, not the child index.
412  */
block_tree_node_clear_end(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int start_index)413 static void block_tree_node_clear_end(const struct block_tree* tree,
414                                       struct block_tree_node* node_rw,
415                                       unsigned int start_index) {
416     block_tree_node_shift_down(tree, node_rw, start_index, UINT_MAX,
417                                block_tree_node_is_leaf(node_rw)
418                                        ? SHIFT_LEAF_OR_LEFT_CHILD
419                                        : SHIFT_RIGHT_CHILD);
420 }
421 
422 /**
423  * block_tree_node_insert - Helper function to insert one entry in a node
424  * @tree:           Tree object.
425  * @node_rw:        Node.
426  * @index:          Index to insert at.
427  * @shift_mode:     Specifies which child to insert for internal nodes.
428  * @new_key:        Key to insert.
429  * @new_data:       Data or child to insert.
430  * @overflow_key:   Location to store key that was shifted out of range. Can be
431  *                  %NULL if that key is always 0.
432  * @overflow_data:  Location to store data (or child) that was shifted out of
433  *                  range. Can be %NULL if that data is all 0.
434  */
block_tree_node_insert(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int index,uint32_t shift_mode,const void * new_key,const void * new_data,void * overflow_key,void * overflow_data)435 static void block_tree_node_insert(const struct block_tree* tree,
436                                    struct block_tree_node* node_rw,
437                                    unsigned int index,
438                                    uint32_t shift_mode,
439                                    const void* new_key,
440                                    const void* new_data,
441                                    void* overflow_key,
442                                    void* overflow_data) {
443     block_tree_node_shift(tree, node_rw, index + 1, index, shift_mode, new_key,
444                           new_data, overflow_key, overflow_data);
445 }
446 
447 /**
448  * block_tree_node_get_key - Get key from node or in-memory pending update
449  * @tree:           Tree object.
450  * @node_block:     Block number where @node_ro data is stored.
451  * @node_ro:        Node.
452  * @index:          Index of key to get.
453  *
454  * Return: key at @index, or 0 if there is no key at @index. If @index is
455  * equal to max key count, check for a matching entry in @tree->inserting.
456  */
block_tree_node_get_key(const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,unsigned int index)457 static data_block_t block_tree_node_get_key(
458         const struct block_tree* tree,
459         data_block_t node_block,
460         const struct block_tree_node* node_ro,
461         unsigned int index) {
462     data_block_t key = 0;
463     const void* keyp;
464     const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
465     const size_t key_size = tree->key_size;
466 
467     assert(node_ro);
468 
469     if (index < key_count) {
470         keyp = node_ro->data + index * tree->key_size;
471         assert(sizeof(key) >= key_size);
472         memcpy(&key, keyp, key_size);
473     }
474     if (!key && node_block == tree->inserting.block) {
475         assert(index >= key_count);
476 
477         if (index <= key_count) {
478             key = tree->inserting.key;
479         }
480     }
481     return key;
482 }
483 
484 /**
485  * block_tree_node_set_key - Set key in node
486  * @tree:           Tree object.
487  * @node_rw:        Node.
488  * @index:          Index of key to set.
489  * @new_key:        Key value to write.
490  */
block_tree_node_set_key(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int index,data_block_t new_key)491 static void block_tree_node_set_key(const struct block_tree* tree,
492                                     struct block_tree_node* node_rw,
493                                     unsigned int index,
494                                     data_block_t new_key) {
495     const size_t key_size = tree->key_size;
496     const size_t key_count = block_tree_node_max_key_count(tree, node_rw);
497 
498     assert(node_rw);
499     assert(index < key_count);
500     assert(key_size <= sizeof(new_key));
501     /* TODO: support big-endian host */
502     memcpy(node_rw->data + index * tree->key_size, &new_key, key_size);
503 }
504 
505 /**
506  * block_tree_node_get_child_data - Get pointer to child or data
507  * @tree:           Tree object.
508  * @node_ro:        Node.
509  * @index:          Index of child or data entry to get.
510  *
511  * Return: pointer to child or data entry at index @index in @node_ro.
512  */
block_tree_node_get_child_data(const struct block_tree * tree,const struct block_tree_node * node_ro,unsigned int index)513 static const void* block_tree_node_get_child_data(
514         const struct block_tree* tree,
515         const struct block_tree_node* node_ro,
516         unsigned int index) {
517     bool is_leaf = block_tree_node_is_leaf(node_ro);
518     const size_t max_key_count = tree->key_count[is_leaf];
519     const size_t child_data_size = tree->child_data_size[is_leaf];
520     const void* child_data;
521 
522     assert(index < max_key_count + !is_leaf);
523 
524     child_data = node_ro->data + tree->key_size * max_key_count +
525                  child_data_size * index;
526 
527     assert(child_data > (void*)node_ro->data);
528     assert(child_data < (void*)node_ro + tree->block_size);
529 
530     return child_data;
531 }
532 
533 /**
534  * block_tree_node_get_child_data_rw - Get pointer to child or data
535  * @tree:           Tree object.
536  * @node_rw:        Node.
537  * @index:          Index of child or data entry to get.
538  *
539  * Return: pointer to child or data entry at index @index in @node_rw.
540  */
block_tree_node_get_child_data_rw(const struct block_tree * tree,struct block_tree_node * node_rw,int index)541 static void* block_tree_node_get_child_data_rw(const struct block_tree* tree,
542                                                struct block_tree_node* node_rw,
543                                                int index) {
544     return (void*)block_tree_node_get_child_data(tree, node_rw, index);
545 }
546 
547 /**
548  * block_tree_node_get_child - Get child from node or in-memory pending update
549  * @tree:           Tree object.
550  * @node_block:     Block number where @node_ro data is stored.
551  * @node_ro:        Internal node.
552  * @index:          Index of child to get.
553  *
554  * Return: child at @index, or 0 if there is no child at @index. If @index is
555  * equal to max child count, check for a matching entry in @tree->inserting.
556  */
block_tree_node_get_child(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,unsigned int index)557 static const struct block_mac* block_tree_node_get_child(
558         const struct transaction* tr,
559         const struct block_tree* tree,
560         data_block_t node_block,
561         const struct block_tree_node* node_ro,
562         unsigned int index) {
563     const struct block_mac* child = NULL;
564     const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
565 
566     assert(!block_tree_node_is_leaf(node_ro));
567 
568     if (index <= key_count) {
569         child = block_tree_node_get_child_data(tree, node_ro, index);
570         if (!block_mac_to_block(tr, child)) {
571             child = NULL;
572         }
573     }
574 
575     if (!child && node_block == tree->inserting.block) {
576         assert(index > key_count);
577 
578         if (index <= key_count + 1) {
579             child = &tree->inserting.child;
580         }
581     }
582 
583     return child;
584 }
585 
586 /**
587  * block_tree_node_get_data - Get data from node or in-memory pending update
588  * @tree:           Tree object.
589  * @node_block:     Block number where @node_ro data is stored.
590  * @node_ro:        Leaf node.
591  * @index:          Index of data to get.
592  *
593  * Return: data at @index, or 0 if there is no data at @index. If @index is
594  * equal to max key count, check for a matching entry in @tree->inserting.
595  */
block_tree_node_get_data(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,unsigned int index)596 static struct block_mac block_tree_node_get_data(
597         const struct transaction* tr,
598         const struct block_tree* tree,
599         data_block_t node_block,
600         const struct block_tree_node* node_ro,
601         unsigned int index) {
602     struct block_mac block_mac_ret = BLOCK_MAC_INITIAL_VALUE(block_mac_ret);
603     const struct block_mac* datap = NULL;
604     const size_t max_key_count = block_tree_node_max_key_count(tree, node_ro);
605 
606     assert(block_tree_node_is_leaf(node_ro));
607 
608     if (index < max_key_count) {
609         datap = block_tree_node_get_child_data(tree, node_ro, index);
610         if (!block_mac_to_block(tr, datap)) {
611             datap = NULL;
612         }
613     }
614     if (!datap && node_block == tree->inserting.block) {
615         assert(index >= max_key_count);
616 
617         if (index <= max_key_count) {
618             datap = &tree->inserting.data;
619         }
620     }
621     if (datap) {
622         block_mac_copy(tr, &block_mac_ret, datap);
623     }
624     return block_mac_ret;
625 }
626 
627 #define block_tree_node_for_each_child(tr, tree, block, node_ro, child, i) \
628     for (i = 0;                                                            \
629          (child = block_tree_node_get_child(tr, tree, block, node_ro, i)); \
630          i++)
631 
632 /**
633  * block_tree_node_print_internal - Print internal node
634  * @tr:             Transaction object.
635  * @tree:           Tree object.
636  * @node_block:     Block number where @node_ro data is stored.
637  * @node_ro:        Node.
638  */
block_tree_node_print_internal(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro)639 static void block_tree_node_print_internal(
640         const struct transaction* tr,
641         const struct block_tree* tree,
642         data_block_t node_block,
643         const struct block_tree_node* node_ro) {
644     unsigned int i;
645     const struct block_mac* child;
646     const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
647 
648     assert(!block_tree_node_is_leaf(node_ro));
649 
650     for (i = 0; i <= key_count + 1; i++) {
651         child = block_tree_node_get_child(tr, tree, node_block, node_ro, i);
652         if (child) {
653             printf(" %" PRIu64 "", block_mac_to_block(tr, child));
654         } else if (i < key_count + 1) {
655             printf(" .");
656         }
657         if (block_tree_node_get_key(tree, node_block, node_ro, i)) {
658             if (i == key_count) {
659                 printf(" inserting");
660             }
661             printf(" [%" PRIu64 "-]",
662                    block_tree_node_get_key(tree, node_block, node_ro, i));
663         }
664     }
665     assert(!block_tree_node_get_child(tr, tree, node_block, node_ro, i));
666     printf("\n");
667 }
668 
669 /**
670  * block_tree_node_print_leaf - Print leaf node
671  * @tr:             Transaction object.
672  * @tree:           Tree object.
673  * @node_block:     Block number where @node_ro data is stored.
674  * @node_ro:        Node.
675  */
block_tree_node_print_leaf(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro)676 static void block_tree_node_print_leaf(const struct transaction* tr,
677                                        const struct block_tree* tree,
678                                        data_block_t node_block,
679                                        const struct block_tree_node* node_ro) {
680     unsigned int i;
681     data_block_t key;
682     struct block_mac data;
683     const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
684 
685     assert(block_tree_node_is_leaf(node_ro));
686 
687     for (i = 0; i <= key_count; i++) {
688         key = block_tree_node_get_key(tree, node_block, node_ro, i);
689         data = block_tree_node_get_data(tr, tree, node_block, node_ro, i);
690         if (key || block_mac_valid(tr, &data)) {
691             if (i == key_count) {
692                 printf(" inserting");
693             }
694             printf(" [%" PRIu64 ": %" PRIu64 "]", key,
695                    block_mac_to_block(tr, &data));
696         } else if (i < key_count) {
697             printf(" .");
698         }
699     }
700     printf("\n");
701 }
702 
703 /**
704  * block_tree_node_print - Print node
705  * @tr:             Transaction object.
706  * @tree:           Tree object.
707  * @node_block:     Block number where @node_ro data is stored.
708  * @node_ro:        Node.
709  */
block_tree_node_print(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro)710 static void block_tree_node_print(const struct transaction* tr,
711                                   const struct block_tree* tree,
712                                   data_block_t node_block,
713                                   const struct block_tree_node* node_ro) {
714     printf("  %3" PRIu64 ":", node_block);
715     if (node_ro->is_leaf == true) {
716         block_tree_node_print_leaf(tr, tree, node_block, node_ro);
717     } else if (!node_ro->is_leaf) {
718         block_tree_node_print_internal(tr, tree, node_block, node_ro);
719     } else {
720         printf(" bad node header %" PRIx64 "\n", node_ro->is_leaf);
721     }
722 }
723 
724 /**
725  * block_tree_print_sub_tree - Print tree or a branch of a tree
726  * @tr:             Transaction object.
727  * @tree:           Tree object.
728  * @block_mac:      Root of tree or branch to print.
729  */
block_tree_print_sub_tree(struct transaction * tr,const struct block_tree * tree,const struct block_mac * block_mac)730 static void block_tree_print_sub_tree(struct transaction* tr,
731                                       const struct block_tree* tree,
732                                       const struct block_mac* block_mac) {
733     int i;
734     const struct block_tree_node* node_ro;
735     struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
736     const struct block_mac* child;
737 
738     if (!block_mac || !block_mac_valid(tr, block_mac)) {
739         printf("empty\n");
740         return;
741     }
742 
743     node_ro = block_get(tr, block_mac, NULL, &node_ref);
744     if (!node_ro) {
745         printf("  %3" PRIu64 ": unreadable\n",
746                block_mac_to_block(tr, block_mac));
747         return;
748     }
749     block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac), node_ro);
750     if (!node_ro->is_leaf) {
751         block_tree_node_for_each_child(tr, tree,
752                                        block_mac_to_block(tr, block_mac),
753                                        node_ro, child, i) {
754             block_tree_print_sub_tree(tr, tree, child);
755         }
756     }
757     block_put(node_ro, &node_ref);
758 }
759 
760 /**
761  * block_tree_print - Print tree
762  * @tr:             Transaction object.
763  * @tree:           Tree object.
764  */
block_tree_print(struct transaction * tr,const struct block_tree * tree)765 void block_tree_print(struct transaction* tr, const struct block_tree* tree) {
766     block_tree_print_sub_tree(tr, tree, &tree->root);
767 }
768 
769 /**
770  * block_tree_node_check - Check node for errors
771  * @tr:             Transaction object.
772  * @tree:           Tree object.
773  * @node_ro:        Node.
774  * @node_block:     Block number where @node_ro data is stored.
775  * @min_key:        Minimum allowed key value.
776  * @max_key:        Maximum allowed key value.
777  *
778  * Return: %false is an error is detected, %true otherwise.
779  */
block_tree_node_check(const struct transaction * tr,const struct block_tree * tree,const struct block_tree_node * node_ro,data_block_t node_block,data_block_t min_key,data_block_t max_key)780 static bool block_tree_node_check(const struct transaction* tr,
781                                   const struct block_tree* tree,
782                                   const struct block_tree_node* node_ro,
783                                   data_block_t node_block,
784                                   data_block_t min_key,
785                                   data_block_t max_key) {
786     unsigned int i;
787     data_block_t key;
788     data_block_t prev_key = 0;
789     int empty_count;
790     const void* child_data;
791     size_t key_count = block_tree_node_max_key_count(tree, node_ro);
792     bool is_leaf;
793 
794     if (node_ro->is_leaf && node_ro->is_leaf != true) {
795         printf("%s: bad node header %" PRIx64 "\n", __func__, node_ro->is_leaf);
796         goto err;
797     }
798     is_leaf = block_tree_node_is_leaf(node_ro);
799 
800     empty_count = 0;
801     for (i = 0; i < key_count; i++) {
802         key = block_tree_node_get_key(tree, node_block, node_ro, i);
803         if (!key) {
804             empty_count++;
805         }
806         if (empty_count) {
807             if (key) {
808                 printf("%s: %" PRIu64
809                        ": expected zero key at %d, found %" PRIu64 "\n",
810                        __func__, node_block, i, key);
811                 goto err;
812             }
813             child_data =
814                     block_tree_node_get_child_data(tree, node_ro, i + !is_leaf);
815             if (!is_zero(child_data, tree->child_data_size[is_leaf])) {
816                 printf("%s: %" PRIu64
817                        ": expected zero data/right child value at %d\n",
818                        __func__, node_block, i);
819                 goto err;
820             }
821             continue;
822         }
823         if (key < prev_key || key < min_key || key > max_key) {
824             printf("%s: %" PRIu64 ": bad key at %d, %" PRIu64
825                    " not in [%" PRIu64 "/%" PRIu64 "-%" PRIu64 "]\n",
826                    __func__, node_block, i, key, min_key, prev_key, max_key);
827             if (tr->failed && key >= prev_key) {
828                 printf("%s: transaction failed, ignore\n", __func__);
829             } else {
830                 goto err;
831             }
832         }
833         prev_key = key;
834     }
835     return true;
836 
837 err:
838     if (print_check_errors) {
839         block_tree_node_print(tr, tree, node_block, node_ro);
840     }
841     return false;
842 }
843 
844 /**
845  * block_tree_check_sub_tree - Check tree or a branch of a tree for errros
846  * @tr:             Transaction object.
847  * @tree:           Tree object.
848  * @block_mac:      Root of tree or branch to check.
849  * @is_root:        %true if @block_mac refers to the root of the tree, %false
850  *                  otherwise.
851  * @min_key:        Minimum allowed key value.
852  * @max_key:        Maximum allowed key value.
853  * @updating:       %true if @tree is currently updating and nodes below
854  *                  min-full should be allowed, %false otherwise.
855  *
856  * Return: Depth of tree/branch, -1 if an error was detected or -2 if @block_mac
857  * could not be read.
858  *
859  * TODO: Reduce overlap with and move more checks to block_tree_node_check.
860  */
block_tree_check_sub_tree(struct transaction * tr,const struct block_tree * tree,const struct block_mac * block_mac,bool is_root,data_block_t min_key,data_block_t max_key,bool updating)861 static int block_tree_check_sub_tree(struct transaction* tr,
862                                      const struct block_tree* tree,
863                                      const struct block_mac* block_mac,
864                                      bool is_root,
865                                      data_block_t min_key,
866                                      data_block_t max_key,
867                                      bool updating) {
868     const struct block_tree_node* node_ro;
869     unsigned int i;
870     int last_child = 0;
871     int empty_count;
872     int depth;
873     int sub_tree_depth;
874     data_block_t child_min_key;
875     data_block_t child_max_key;
876     data_block_t key;
877     int max_empty_count;
878     size_t key_count;
879     const void* child_data;
880     struct block_mac child_block_mac;
881     struct obj_ref ref = OBJ_REF_INITIAL_VALUE(ref);
882     bool is_leaf;
883 
884     if (!block_mac || !block_mac_to_block(tr, block_mac))
885         return 0;
886 
887     depth = 1;
888 
889     if (block_mac_to_block(tr, block_mac) >= tr->fs->dev->block_count) {
890         printf("%s: %3" PRIu64 ": invalid\n", __func__,
891                block_mac_to_block(tr, block_mac));
892         return -1;
893     }
894 
895     node_ro = block_get_no_tr_fail(tr, block_mac, NULL, &ref);
896     if (!node_ro) {
897         if (tr->failed) {
898             /*
899              * Failed transactions discards dirty cache entries so parts of the
900              * tree may now be unreadable.
901              */
902             pr_warn("ignore unreadable block %" PRIu64 ", transaction failed\n",
903                     block_mac_to_block(tr, block_mac));
904             return -2;
905         }
906         pr_warn("%3" PRIu64 ": unreadable\n",
907                 block_mac_to_block(tr, block_mac));
908         return -2;
909     }
910 
911     if (!block_tree_node_check(tr, tree, node_ro,
912                                block_mac_to_block(tr, block_mac), min_key,
913                                max_key)) {
914         goto err;
915     }
916 
917     if (node_ro->is_leaf && node_ro->is_leaf != true) {
918         printf("%s: bad node header %" PRIx64 "\n", __func__, node_ro->is_leaf);
919         goto err;
920     }
921     is_leaf = block_tree_node_is_leaf(node_ro);
922 
923     key_count = block_tree_node_max_key_count(tree, node_ro);
924     /* TODO: don't allow empty root */
925     max_empty_count =
926             is_root ? (key_count /*- 1*/) : ((key_count + 1) / 2 + updating);
927 
928     child_min_key = min_key;
929     empty_count = 0;
930     for (i = 0; i < key_count; i++) {
931         key = block_tree_node_get_key(tree, block_mac_to_block(tr, block_mac),
932                                       node_ro, i);
933         if (!key) {
934             empty_count++;
935         }
936         if (empty_count) {
937             if (key) {
938                 printf("%s: %" PRIu64
939                        ": expected zero key at %d, found %" PRIu64 "\n",
940                        __func__, block_mac_to_block(tr, block_mac), i, key);
941                 goto err;
942             }
943             child_data =
944                     block_tree_node_get_child_data(tree, node_ro, i + !is_leaf);
945             if (!is_zero(child_data, tree->child_data_size[is_leaf])) {
946                 printf("%s: %" PRIu64
947                        ": expected zero data/right child value at %d\n",
948                        __func__, block_mac_to_block(tr, block_mac), i);
949                 goto err;
950             }
951             continue;
952         }
953         if (i == 0 && min_key && is_leaf && key != min_key) {
954             printf("%s: %" PRIu64 ": bad key at %d, %" PRIu64
955                    " not start of [%" PRIu64 "-%" PRIu64 "]\n",
956                    __func__, block_mac_to_block(tr, block_mac), i, key, min_key,
957                    max_key);
958             if (tr->failed) {
959                 printf("%s: transaction failed, ignore\n", __func__);
960             } else if (!key) {
961                 // warn only for now
962                 printf("%s: ignore empty node error\n", __func__);
963             } else {
964                 goto err;
965             }
966         }
967         min_key = key;
968         child_max_key = key;
969         if (!is_leaf) {
970             child_data = block_tree_node_get_child_data(tree, node_ro, i);
971             block_mac_copy(tr, &child_block_mac, child_data);
972             block_put(node_ro, &ref);
973             sub_tree_depth = block_tree_check_sub_tree(
974                     tr, tree, &child_block_mac, false, child_min_key,
975                     child_max_key, updating);
976             node_ro = block_get_no_tr_fail(tr, block_mac, NULL, &ref);
977             if (!node_ro) {
978                 pr_warn("%3" PRIu64 ": unreadable\n",
979                         block_mac_to_block(tr, block_mac));
980                 return -2;
981             }
982             if (sub_tree_depth == -1) {
983                 goto err;
984             }
985             if (sub_tree_depth == -2) {
986                 pr_warn("%" PRIu64 ": unreadable subtree at %d\n",
987                         block_mac_to_block(tr, block_mac), i);
988                 if (depth == 1) {
989                     depth = -2;
990                 }
991             } else if (depth == 1 || depth == -2) {
992                 depth = sub_tree_depth + 1;
993             } else if (sub_tree_depth != depth - 1) {
994                 printf("%s: %" PRIu64 ": bad subtree depth at %d, %d != %d\n",
995                        __func__, block_mac_to_block(tr, block_mac), i,
996                        sub_tree_depth, depth - 1);
997                 goto err;
998             }
999         }
1000         child_min_key = key;
1001         min_key = key;
1002         last_child = i + 1;
1003     }
1004     child_max_key = max_key;
1005     if (!is_leaf) {
1006         child_data = block_tree_node_get_child_data(tree, node_ro, last_child);
1007         block_mac_copy(tr, &child_block_mac, child_data);
1008         block_put(node_ro, &ref);
1009         sub_tree_depth = block_tree_check_sub_tree(tr, tree, &child_block_mac,
1010                                                    false, child_min_key,
1011                                                    child_max_key, updating);
1012         node_ro = block_get_no_tr_fail(tr, block_mac, NULL, &ref);
1013         if (!node_ro) {
1014             pr_warn("%3" PRIu64 ": unreadable\n",
1015                     block_mac_to_block(tr, block_mac));
1016             return -2;
1017         }
1018         if (sub_tree_depth == -1) {
1019             goto err;
1020         }
1021         if (sub_tree_depth == -2) {
1022             pr_warn("%" PRIu64 ": unreadable subtree at %d\n",
1023                     block_mac_to_block(tr, block_mac), last_child);
1024             if (depth == 1) {
1025                 depth = -2;
1026             }
1027         } else if (depth == 1 || depth == -2) {
1028             depth = sub_tree_depth + 1;
1029         } else if (sub_tree_depth != depth - 1) {
1030             printf("%s: %" PRIu64 ": bad subtree depth at %d, %d != %d\n",
1031                    __func__, block_mac_to_block(tr, block_mac), last_child,
1032                    sub_tree_depth, depth - 1);
1033             goto err;
1034         }
1035     }
1036     if (empty_count > max_empty_count) {
1037         printf("%s: %" PRIu64 ": too many empty nodes, %d > %d\n", __func__,
1038                block_mac_to_block(tr, block_mac), empty_count, max_empty_count);
1039         if (tr->failed) {
1040             printf("%s: ignore error, transaction failed\n", __func__);
1041         } else {
1042             goto err;
1043         }
1044     }
1045     block_put(node_ro, &ref);
1046 
1047     return depth;
1048 
1049 err:
1050     block_put(node_ro, &ref);
1051     return -1;
1052 }
1053 
1054 /**
1055  * block_tree_check - Check tree for errros
1056  * @tr:             Transaction object.
1057  * @tree:           Tree object.
1058  *
1059  * Return: %false if an error was detected, %true otherwise.
1060  */
block_tree_check(struct transaction * tr,const struct block_tree * tree)1061 bool block_tree_check(struct transaction* tr, const struct block_tree* tree) {
1062     int ret;
1063 
1064     assert(tr->fs->dev->block_size >= tree->block_size);
1065 
1066     ret = block_tree_check_sub_tree(tr, tree, &tree->root, true, 0,
1067                                     DATA_BLOCK_MAX, tree->updating);
1068     if (ret < 0) {
1069         if (ret == -2) {
1070             pr_warn("block_tree not fully readable\n");
1071             if (!tr->failed) {
1072                 transaction_fail(tr);
1073             }
1074             return true;
1075         }
1076         pr_err("%s: invalid block_tree:\n", __func__);
1077         if (print_check_errors) {
1078             block_tree_print(tr, tree);
1079         }
1080         return false;
1081     }
1082     return true;
1083 }
1084 
1085 /**
1086  * block_tree_node_full - Check if node is full
1087  * @tree:           Tree object.
1088  * @node_ro:        Node.
1089  *
1090  * Return: %true is @node_ro is full, %false otherwise.
1091  */
block_tree_node_full(const struct block_tree * tree,const struct block_tree_node * node_ro)1092 static bool block_tree_node_full(const struct block_tree* tree,
1093                                  const struct block_tree_node* node_ro) {
1094     const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
1095     return !!block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro,
1096                                      key_count - 1);
1097 }
1098 
1099 /**
1100  * block_tree_min_key_count - Return the minimum number of keys for a node
1101  * @tree:           Tree object.
1102  * @leaf:           %true for leaf nodes, %false for internal nodes.
1103  *
1104  * B+ tree minimum full rules for non-root nodes.
1105  *
1106  * min-children = (max-children + 1) / 2 - 1
1107  * max-keys = max-children - 1
1108  * max-children = max-keys + 1
1109  * min-keys = min-children - 1
1110  *          = (max-children + 1) / 2 - 1
1111  *          = (max-keys + 1 + 1) / 2 - 1
1112  *          = max-keys / 2
1113  *
1114  * Return: minimum number of keys required for non-root leaf or internal nodes.
1115  */
block_tree_min_key_count(const struct block_tree * tree,bool leaf)1116 static unsigned int block_tree_min_key_count(const struct block_tree* tree,
1117                                              bool leaf) {
1118     return block_tree_max_key_count(tree, leaf) / 2;
1119 }
1120 
1121 /**
1122  * block_tree_node_min_full_index - Return index of last key in "min-full" node
1123  * @tree:           Tree object.
1124  * @node_ro:        Node.
1125  *
1126  * Return: index.
1127  */
block_tree_node_min_full_index(const struct block_tree * tree,const struct block_tree_node * node_ro)1128 static int block_tree_node_min_full_index(
1129         const struct block_tree* tree,
1130         const struct block_tree_node* node_ro) {
1131     return block_tree_min_key_count(tree, block_tree_node_is_leaf(node_ro)) - 1;
1132 }
1133 
1134 /**
1135  * block_tree_above_min_full - Check if node has more than the minimum number of
1136  * entries
1137  * @tree:           Tree object.
1138  * @node_ro:        Node.
1139  *
1140  * Return: %true if @node_ro has more than the minimim required entry count,
1141  * %false otherwise.
1142  */
block_tree_above_min_full(const struct block_tree * tree,const struct block_tree_node * node_ro)1143 bool block_tree_above_min_full(const struct block_tree* tree,
1144                                const struct block_tree_node* node_ro) {
1145     int min_full_index = block_tree_node_min_full_index(tree, node_ro);
1146     return !!block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro,
1147                                      min_full_index + 1);
1148 }
1149 
1150 /**
1151  * block_tree_above_min_full - Check if node has less than the minimum number of
1152  * entries
1153  * @tree:           Tree object.
1154  * @node_ro:        Node.
1155  *
1156  * Return: %true if @node_ro has less than the minimim required entry count,
1157  * %false otherwise.
1158  */
block_tree_below_min_full(const struct block_tree * tree,const struct block_tree_node * node_ro)1159 bool block_tree_below_min_full(const struct block_tree* tree,
1160                                const struct block_tree_node* node_ro) {
1161     int min_full_index = block_tree_node_min_full_index(tree, node_ro);
1162     return !block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro,
1163                                     min_full_index);
1164 }
1165 
1166 /**
1167  * block_tree_node_need_copy - Check if node needs to be copied before write
1168  * @tr:         Transaction object.
1169  * @tree:       Tree object.
1170  * @block_mac:  Block number and mac of node to check.
1171  *
1172  * Return: %true if @tree is a copy_on_write tree and @block_mac needs to be
1173  * copied before the node can be written, %false otherwise.
1174  */
block_tree_node_need_copy(struct transaction * tr,const struct block_tree * tree,const struct block_mac * block_mac)1175 static bool block_tree_node_need_copy(struct transaction* tr,
1176                                       const struct block_tree* tree,
1177                                       const struct block_mac* block_mac) {
1178     return tree->copy_on_write &&
1179            transaction_block_need_copy(tr, block_mac_to_block(tr, block_mac)) &&
1180            !tr->failed;
1181 }
1182 
1183 /**
1184  * block_tree_node_find_block - Helper function for block_tree_walk
1185  * @tr:             Transaction object.
1186  * @tree:           Tree object.
1187  * @node_block:     Block number where @node_ro data is stored.
1188  * @node_ro:        Node to search.
1189  * @key:            Key to search for.
1190  * @key_is_max:     If %true, and @key is not found in a leaf node, use largest
1191  *                  entry with a key less than @key. If %false, and @key is not
1192  *                  found in a leaf node, use smallest entry with a key greater
1193  *                  than @key. Ignored for internal nodes or when @key is found.
1194  * @child:          Output arg for child at returned index.
1195  * @prev_key:       Output arg. Unmodified if returned index is 0, set to key at
1196  *                  index - 1 otherwise.
1197  * @next_key:       Output arg. Unmodified if returned index does not contain
1198  *                  an entry, set to key at index otherwise.
1199  *
1200  * Return: index of closest matching entry.
1201  */
block_tree_node_find_block(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,data_block_t key,bool key_is_max,const struct block_mac ** child,data_block_t * prev_key,data_block_t * next_key)1202 static int block_tree_node_find_block(const struct transaction* tr,
1203                                       const struct block_tree* tree,
1204                                       data_block_t node_block,
1205                                       const struct block_tree_node* node_ro,
1206                                       data_block_t key,
1207                                       bool key_is_max,
1208                                       const struct block_mac** child,
1209                                       data_block_t* prev_key,
1210                                       data_block_t* next_key) {
1211     int i;
1212     data_block_t curr_key;
1213     int keys_count;
1214     bool is_leaf = block_tree_node_is_leaf(node_ro);
1215 
1216     keys_count = block_tree_node_max_key_count(tree, node_ro);
1217 
1218     /* TODO: better search? */
1219     for (i = 0; i < keys_count + 1; i++) {
1220         curr_key = block_tree_node_get_key(tree, node_block, node_ro, i);
1221         if (!curr_key || (key <= (curr_key - !is_leaf))) {
1222             break;
1223         }
1224         curr_key = 0;
1225     }
1226     if (i == keys_count && curr_key) {
1227         assert(tree->inserting.block == node_block);
1228         if (print_ops) {
1229             printf("%s: using inserting block %" PRIu64 ", key %" PRIu64
1230                    ", child %" PRIu64 ", data %" PRIu64 "\n",
1231                    __func__, tree->inserting.block, tree->inserting.key,
1232                    block_mac_to_block(tr, &tree->inserting.child),
1233                    block_mac_to_block(tr, &tree->inserting.data));
1234         }
1235     }
1236     if (key_is_max && is_leaf && i > 0 && (!curr_key || curr_key > key)) {
1237         i--;
1238         curr_key = block_tree_node_get_key(tree, node_block, node_ro, i);
1239     }
1240 
1241     if (curr_key) {
1242         *next_key = curr_key;
1243     }
1244     if (i > 0) {
1245         /* TODO: move to loop */
1246         *prev_key = block_tree_node_get_key(tree, node_block, node_ro, i - 1);
1247     }
1248 
1249     if (is_leaf) {
1250         *child = NULL;
1251     } else {
1252         *child = block_tree_node_get_child(tr, tree, node_block, node_ro, i);
1253         assert(*child);
1254         assert(block_mac_valid(tr, *child));
1255     }
1256     if (print_lookup) {
1257         printf("%s: block %" PRIu64 ", key %" PRIu64
1258                " return %d, child_block %" PRIu64 ", prev_key %" PRIu64
1259                ", next_key %" PRIu64 "\n",
1260                __func__, node_block, key, i,
1261                *child ? block_mac_to_block(tr, *child) : 0, *prev_key,
1262                *next_key);
1263         block_tree_node_print(tr, tree, node_block, node_ro);
1264     }
1265     return i;
1266 }
1267 
1268 /**
1269  * block_tree_walk - Walk tree
1270  * @tr:             Transaction object.
1271  * @tree:           Tree object.
1272  * @key:            Key to search for.
1273  * @key_is_max:     If %true, and @key is not found in a leaf node, use largest
1274  *                  entry with a key less than @key. If %false, and @key is not
1275  *                  found in a leaf node, use smallest entry with a key greater
1276  *                  than @key. Ignored for internal nodes or when @key is found.
1277  * @path:           Tree path object to initalize.
1278  *
1279  * Walk tree to find path to @key or the insert point for @key.
1280  *
1281  * Note that if @key is not found, @path may point to an empty insert slot. A
1282  * caller that is looking for the closest match to @key, must therefore call
1283  * block_tree_path_next to advance @path to a valid entry if this case is hit.
1284  * TODO: Add a helper function or argument to avoid this.
1285  */
block_tree_walk(struct transaction * tr,struct block_tree * tree,data_block_t key,bool key_is_max,struct block_tree_path * path)1286 void block_tree_walk(struct transaction* tr,
1287                      struct block_tree* tree,
1288                      data_block_t key,
1289                      bool key_is_max,
1290                      struct block_tree_path* path) {
1291     const struct block_tree_node* node_ro;
1292     const struct block_tree_node* parent_node_ro;
1293     struct obj_ref ref[2] = {
1294             OBJ_REF_INITIAL_VALUE(ref[0]),
1295             OBJ_REF_INITIAL_VALUE(ref[1]),
1296     };
1297     int ref_index = 0;
1298     const struct block_mac* child;
1299     data_block_t prev_key = 0;
1300     data_block_t next_key = 0;
1301     int child_index;
1302     const struct block_mac* block_mac;
1303 
1304     if (print_lookup) {
1305         printf("%s: tree root block %" PRIu64 ", key %" PRIu64
1306                ", key_is_max %d\n",
1307                __func__, block_mac_to_block(tr, &tree->root), key, key_is_max);
1308     }
1309 
1310     assert(tr);
1311     assert(tree);
1312     assert(tree->block_size <= tr->fs->dev->block_size);
1313 
1314     path->count = 0;
1315     memset(&path->data, 0, sizeof(path->data));
1316     path->tr = tr;
1317     path->tree = tree;
1318     path->tree_update_count = tree->update_count;
1319 
1320     block_mac = &tree->root;
1321     parent_node_ro = NULL;
1322 
1323     while (block_mac && block_mac_valid(tr, block_mac)) {
1324         assert(path->count < countof(path->entry));
1325 
1326         node_ro = block_get(tr, block_mac, NULL, &ref[ref_index]);
1327         if (!node_ro) {
1328             assert(tr->failed);
1329             pr_warn("transaction failed, abort\n");
1330             path->count = 0;
1331             goto err;
1332         }
1333         assert(node_ro);
1334 
1335         path->entry[path->count].block_mac = *block_mac;
1336 
1337         child_index = block_tree_node_find_block(
1338                 tr, path->tree, block_mac_to_block(tr, block_mac), node_ro, key,
1339                 key_is_max, &child, &prev_key, &next_key);
1340         if (path->count) {
1341             assert(!path->entry[path->count - 1].next_key || next_key);
1342             assert(!path->entry[path->count - 1].next_key ||
1343                    next_key <= path->entry[path->count - 1].next_key);
1344             assert(!path->entry[path->count - 1].prev_key || prev_key);
1345             assert(!path->entry[path->count - 1].prev_key ||
1346                    prev_key >= path->entry[path->count - 1].prev_key);
1347         }
1348         path->entry[path->count].index = child_index;
1349         path->entry[path->count].prev_key = prev_key;
1350         path->entry[path->count].next_key = next_key;
1351         if (!child) {
1352             assert(block_tree_node_is_leaf(node_ro));
1353             path->data = block_tree_node_get_data(
1354                     tr, tree, block_mac_to_block(tr, block_mac), node_ro,
1355                     child_index);
1356             assert(!key_is_max || block_mac_valid(path->tr, &path->data) ||
1357                    path->count == 0);
1358         }
1359 
1360         block_mac = child;
1361         if (parent_node_ro) {
1362             block_put(parent_node_ro, &ref[!ref_index]);
1363         }
1364         parent_node_ro = node_ro;
1365         ref_index = !ref_index;
1366 
1367         if (print_lookup) {
1368             printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1369                    ", next_key %" PRIu64 "\n",
1370                    __func__, path->count,
1371                    block_mac_to_block(path->tr,
1372                                       &path->entry[path->count].block_mac),
1373                    path->entry[path->count].index,
1374                    path->entry[path->count].prev_key,
1375                    path->entry[path->count].next_key);
1376             if (!block_mac) {
1377                 printf("%s: path.data.block = %" PRIu64 "\n", __func__,
1378                        block_mac_to_block(path->tr, &path->data));
1379             }
1380         }
1381         path->count++;
1382     }
1383 err:
1384     if (parent_node_ro) {
1385         block_put(parent_node_ro, &ref[!ref_index]);
1386     }
1387 }
1388 
1389 /**
1390  * block_tree_path_next - Walk tree to get to next entry
1391  * @path:           Tree path object.
1392  */
block_tree_path_next(struct block_tree_path * path)1393 void block_tree_path_next(struct block_tree_path* path) {
1394     const struct block_tree_node* node_ro;
1395     const struct block_tree_node* parent_node_ro;
1396     struct obj_ref ref[2] = {
1397             OBJ_REF_INITIAL_VALUE(ref[0]),
1398             OBJ_REF_INITIAL_VALUE(ref[1]),
1399     };
1400     bool ref_index = 0;
1401     unsigned int index;
1402     unsigned int depth;
1403     const struct block_mac* block_mac;
1404     data_block_t prev_key;
1405     data_block_t next_key;
1406     struct block_mac next_data;
1407     const struct block_mac* next_child;
1408     data_block_t parent_next_key;
1409 
1410     assert(path->tree_update_count == path->tree->update_count);
1411     assert(path->count);
1412 
1413     depth = path->count - 1;
1414     assert(path->entry[depth].next_key);
1415 
1416     if (print_lookup) {
1417         printf("%s: tree root block %" PRIu64 "\n", __func__,
1418                block_mac_to_block(path->tr, &path->tree->root));
1419     }
1420 
1421     block_mac = &path->entry[depth].block_mac;
1422     index = path->entry[depth].index;
1423     parent_next_key = depth > 0 ? path->entry[depth - 1].next_key : 0;
1424 
1425     node_ro = block_get(path->tr, block_mac, NULL, &ref[ref_index]);
1426     if (!node_ro) {
1427         assert(path->tr->failed);
1428         pr_warn("transaction failed, abort\n");
1429         goto err_no_refs;
1430     }
1431     assert(node_ro);
1432     assert(block_tree_node_is_leaf(node_ro));
1433     prev_key = block_tree_node_get_key(path->tree,
1434                                        block_mac_to_block(path->tr, block_mac),
1435                                        node_ro, index);
1436     index++;
1437     next_key = block_tree_node_get_key(path->tree,
1438                                        block_mac_to_block(path->tr, block_mac),
1439                                        node_ro, index);
1440     next_data = block_tree_node_get_data(
1441             path->tr, path->tree, block_mac_to_block(path->tr, block_mac),
1442             node_ro, index);
1443     block_put(node_ro, &ref[ref_index]);
1444 
1445     assert(path->entry[depth].next_key == prev_key || !prev_key);
1446 
1447     if (next_key || !parent_next_key) {
1448         assert(!next_key || next_key >= prev_key);
1449         path->entry[depth].index = index;
1450         path->entry[depth].prev_key = prev_key;
1451         path->entry[depth].next_key = next_key;
1452         path->data = next_data;
1453         if (print_lookup) {
1454             printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1455                    ", next_key %" PRIu64 "\n",
1456                    __func__, depth,
1457                    block_mac_to_block(path->tr, &path->entry[depth].block_mac),
1458                    path->entry[depth].index, path->entry[depth].prev_key,
1459                    path->entry[depth].next_key);
1460             printf("%s: path.data.block = %" PRIu64 "\n", __func__,
1461                    block_mac_to_block(path->tr, &path->data));
1462         }
1463         return;
1464     }
1465 
1466     assert(depth > 0);
1467 
1468     while (depth > 0) {
1469         depth--;
1470         if (!path->entry[depth].next_key) {
1471             continue;
1472         }
1473         block_mac = &path->entry[depth].block_mac;
1474         index = path->entry[depth].index;
1475 
1476         node_ro = block_get(path->tr, block_mac, NULL, &ref[ref_index]);
1477         if (!node_ro) {
1478             assert(path->tr->failed);
1479             pr_warn("transaction failed, abort\n");
1480             goto err_no_refs;
1481         }
1482         assert(node_ro);
1483         assert(!block_tree_node_is_leaf(node_ro));
1484 
1485         parent_next_key = depth > 0 ? path->entry[depth - 1].next_key : 0;
1486 
1487         prev_key = block_tree_node_get_key(
1488                 path->tree, block_mac_to_block(path->tr, block_mac), node_ro,
1489                 index);
1490         index++;
1491         next_key = block_tree_node_get_key(
1492                 path->tree, block_mac_to_block(path->tr, block_mac), node_ro,
1493                 index);
1494         next_child = block_tree_node_get_child(
1495                 path->tr, path->tree, block_mac_to_block(path->tr, block_mac),
1496                 node_ro, index);
1497         if (next_child) {
1498             parent_node_ro = node_ro;
1499             ref_index = !ref_index;
1500             assert(prev_key && prev_key == path->entry[depth].next_key);
1501             path->entry[depth].index = index;
1502             path->entry[depth].prev_key = prev_key;
1503             path->entry[depth].next_key = next_key ?: parent_next_key;
1504             if (print_lookup) {
1505                 printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1506                        ", next_key %" PRIu64 "\n",
1507                        __func__, depth,
1508                        block_mac_to_block(path->tr,
1509                                           &path->entry[depth].block_mac),
1510                        path->entry[depth].index, path->entry[depth].prev_key,
1511                        path->entry[depth].next_key);
1512             }
1513             break;
1514         }
1515         block_put(node_ro, &ref[ref_index]);
1516     }
1517     assert(next_child);
1518     while (++depth < path->count - 1) {
1519         node_ro = block_get(path->tr, next_child, NULL, &ref[ref_index]);
1520         if (!node_ro) {
1521             assert(path->tr->failed);
1522             pr_warn("transaction failed, abort\n");
1523             goto err_has_parent_ref;
1524         }
1525         assert(node_ro);
1526         assert(!block_tree_node_is_leaf(node_ro));
1527         path->entry[depth].block_mac = *next_child;
1528         block_put(parent_node_ro, &ref[!ref_index]);
1529         path->entry[depth].index = 0;
1530         path->entry[depth].prev_key = prev_key;
1531         path->entry[depth].next_key = block_tree_node_get_key(
1532                 path->tree, DATA_BLOCK_INVALID, node_ro, 0);
1533         if (print_lookup) {
1534             printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1535                    ", next_key %" PRIu64 "\n",
1536                    __func__, depth,
1537                    block_mac_to_block(path->tr, &path->entry[depth].block_mac),
1538                    path->entry[depth].index, path->entry[depth].prev_key,
1539                    path->entry[depth].next_key);
1540         }
1541         next_child = block_tree_node_get_child(path->tr, path->tree,
1542                                                DATA_BLOCK_INVALID, node_ro, 0);
1543         parent_node_ro = node_ro;
1544         ref_index = !ref_index;
1545         assert(path->entry[depth].next_key);
1546         assert(next_child);
1547     }
1548 
1549     assert(next_child);
1550     node_ro = block_get(path->tr, next_child, NULL, &ref[ref_index]);
1551     if (!node_ro) {
1552         assert(path->tr->failed);
1553         pr_warn("transaction failed, abort\n");
1554         goto err_has_parent_ref;
1555     }
1556     assert(node_ro);
1557     assert(block_tree_node_is_leaf(node_ro));
1558     path->entry[depth].block_mac = *next_child;
1559     block_put(parent_node_ro, &ref[!ref_index]);
1560     path->entry[depth].index = 0;
1561     path->entry[depth].prev_key = prev_key;
1562     path->entry[depth].next_key =
1563             block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_ro, 0);
1564     path->data = block_tree_node_get_data(path->tr, path->tree,
1565                                           DATA_BLOCK_INVALID, node_ro, 0);
1566     block_put(node_ro, &ref[ref_index]);
1567     assert(path->entry[depth].next_key);
1568     if (print_lookup) {
1569         printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1570                ", next_key %" PRIu64 "\n",
1571                __func__, depth,
1572                block_mac_to_block(path->tr, &path->entry[depth].block_mac),
1573                path->entry[depth].index, path->entry[depth].prev_key,
1574                path->entry[depth].next_key);
1575         printf("%s: path.data.block = %" PRIu64 "\n", __func__,
1576                block_mac_to_block(path->tr, &path->data));
1577     }
1578     return;
1579 
1580 err_has_parent_ref:
1581     block_put(parent_node_ro, &ref[!ref_index]);
1582 err_no_refs:
1583     assert(path->tr->failed);
1584     path->count = 0;
1585 }
1586 
1587 /**
1588  * block_tree_block_dirty - Get writable node
1589  * @tr:         Transaction object.
1590  * @path:       Tree path object.
1591  * @path_index: Path depth that @node_ro was found at.
1592  * @node_ro:    Source node
1593  *
1594  * Copy @node_ro and update @path if needed.
1595  *
1596  * Return: writable pointer to @node_ro.
1597  */
block_tree_block_dirty(struct transaction * tr,struct block_tree_path * path,int path_index,const struct block_tree_node * node_ro)1598 static struct block_tree_node* block_tree_block_dirty(
1599         struct transaction* tr,
1600         struct block_tree_path* path,
1601         int path_index,
1602         const struct block_tree_node* node_ro) {
1603     data_block_t new_block;
1604     struct block_mac* block_mac;
1605 
1606     block_mac = &path->entry[path_index].block_mac;
1607 
1608     assert(path_index ||
1609            block_mac_same_block(tr, block_mac, &path->tree->root));
1610 
1611     if (!block_tree_node_need_copy(tr, path->tree, block_mac)) {
1612         if (tr->failed) {
1613             return NULL;
1614         }
1615         return block_dirty(tr, node_ro, !path->tree->allow_copy_on_write);
1616     }
1617     assert(path->tree->allow_copy_on_write);
1618     new_block = block_allocate_etc(tr, !path->tree->allow_copy_on_write);
1619     if (print_internal_changes) {
1620         printf("%s: copy block %" PRIu64 " to %" PRIu64 "\n", __func__,
1621                block_mac_to_block(tr, block_mac), new_block);
1622     }
1623     if (!new_block) {
1624         return NULL;
1625     }
1626     assert(new_block != block_mac_to_block(tr, block_mac));
1627     assert(!tr->failed);
1628     block_free(tr, block_mac_to_block(tr, block_mac));
1629     if (tr->failed) {
1630         pr_warn("transaction failed, abort\n");
1631         return NULL;
1632     }
1633     block_mac_set_block(tr, block_mac, new_block);
1634     if (!path_index) {
1635         block_mac_set_block(tr, &path->tree->root, new_block);
1636         path->tree->root_block_changed = true;
1637     }
1638     return block_move(tr, node_ro, new_block, !path->tree->allow_copy_on_write);
1639 }
1640 
1641 /**
1642  * block_tree_block_get_write - Get writable node fro path
1643  * @tr:         Transaction object.
1644  * @path:       Tree path object.
1645  * @path_index: Path depth to get node from.
1646  * @ref:        Pointer to store reference in.
1647  *
1648  * Get node from @path and @path_index, copying it and updating @path if
1649  * needed.
1650  *
1651  * Return: writable pointer to node.
1652  */
block_tree_block_get_write(struct transaction * tr,struct block_tree_path * path,int path_index,struct obj_ref * ref)1653 static struct block_tree_node* block_tree_block_get_write(
1654         struct transaction* tr,
1655         struct block_tree_path* path,
1656         int path_index,
1657         struct obj_ref* ref) {
1658     const struct block_tree_node* node_ro;
1659     struct block_tree_node* node_rw;
1660 
1661     node_ro = block_get(tr, &path->entry[path_index].block_mac, NULL, ref);
1662     if (!node_ro) {
1663         return NULL;
1664     }
1665 
1666     node_rw = block_tree_block_dirty(tr, path, path_index, node_ro);
1667     if (!node_rw) {
1668         block_put(node_ro, ref);
1669     }
1670     return node_rw;
1671 }
1672 
1673 /**
1674  * block_tree_path_put_dirty - Release dirty node and update tree
1675  * @tr:         Transaction object.
1676  * @path:       Tree path object.
1677  * @path_index: Path depth to get node from.
1678  * @data:       Block data pointed to by leaf node when @path_index is
1679  *              @path->count. Tree node otherwise.
1680  * @data_ref:   Reference to @data that will be released.
1681  *
1682  * Release dirty external block or tree node and update tree. Update mac values
1683  * and copy blocks as needed until root of tree is reached.
1684  */
block_tree_path_put_dirty(struct transaction * tr,struct block_tree_path * path,int path_index,void * data,struct obj_ref * data_ref)1685 void block_tree_path_put_dirty(struct transaction* tr,
1686                                struct block_tree_path* path,
1687                                int path_index,
1688                                void* data,
1689                                struct obj_ref* data_ref) {
1690     unsigned int index;
1691     struct block_mac* block_mac;
1692     struct block_tree_node* parent_node_rw;
1693     bool parent_is_leaf;
1694     struct obj_ref parent_node_ref = OBJ_REF_INITIAL_VALUE(parent_node_ref);
1695     struct obj_ref tmp_ref = OBJ_REF_INITIAL_VALUE(tmp_ref);
1696 
1697     if (path_index == (int)path->count) {
1698         assert(path_index < (int)countof(path->entry));
1699         path->entry[path_index].block_mac = path->data;  // copy data
1700     }
1701 
1702     while (--path_index >= 0) {
1703         parent_node_rw = block_tree_block_get_write(tr, path, path_index,
1704                                                     &parent_node_ref);
1705         if (tr->failed) {
1706             assert(!parent_node_rw);
1707             block_put_dirty_discard(data, data_ref);
1708             pr_warn("transaction failed, abort\n");
1709             return;
1710         }
1711         assert(parent_node_rw);
1712         parent_is_leaf = block_tree_node_is_leaf(parent_node_rw);
1713         index = path->entry[path_index].index;
1714         assert(path_index == (int)path->count - 1 || !parent_is_leaf);
1715         assert(path->tree->child_data_size[parent_is_leaf] <=
1716                sizeof(*block_mac));
1717         assert(path->tree->child_data_size[parent_is_leaf] >=
1718                tr->fs->block_num_size + tr->fs->mac_size);
1719         block_mac = block_tree_node_get_child_data_rw(path->tree,
1720                                                       parent_node_rw, index);
1721 
1722         /* check that block was copied if needed */
1723         assert(!block_tree_node_need_copy(tr, path->tree, block_mac) ||
1724                !block_mac_same_block(tr, block_mac,
1725                                      &path->entry[path_index + 1].block_mac));
1726 
1727         /* check that block was not copied when not needed */
1728         assert(block_tree_node_need_copy(tr, path->tree, block_mac) ||
1729                block_mac_same_block(tr, block_mac,
1730                                     &path->entry[path_index + 1].block_mac) ||
1731                tr->failed);
1732 
1733         if (!block_mac_same_block(tr, block_mac,
1734                                   &path->entry[path_index + 1].block_mac)) {
1735             if (print_internal_changes) {
1736                 printf("%s: update copied block %" PRIu64 " to %" PRIu64 "\n",
1737                        __func__, block_mac_to_block(tr, block_mac),
1738                        block_mac_to_block(
1739                                tr, &path->entry[path_index + 1].block_mac));
1740             }
1741             block_mac_set_block(
1742                     tr, block_mac,
1743                     block_mac_to_block(tr,
1744                                        &path->entry[path_index + 1].block_mac));
1745         }
1746 
1747         if (tr->failed) {
1748             block_put_dirty_discard(data, data_ref);
1749             block_put_dirty_discard(parent_node_rw, &parent_node_ref);
1750             pr_warn("transaction failed, abort\n");
1751             return;
1752         }
1753 
1754         assert(block_mac_eq(tr, block_mac,
1755                             &path->entry[path_index + 1].block_mac));
1756         block_put_dirty(tr, data, data_ref, block_mac, parent_node_rw);
1757         assert(!block_mac_eq(tr, block_mac,
1758                              &path->entry[path_index + 1].block_mac) ||
1759                tr->fs->mac_size < 16);
1760         block_mac_copy_mac(tr, &path->entry[path_index + 1].block_mac,
1761                            block_mac);
1762         assert(block_mac_eq(tr, block_mac,
1763                             &path->entry[path_index + 1].block_mac));
1764         data = parent_node_rw;
1765         data_ref = &tmp_ref;
1766         obj_ref_transfer(data_ref, &parent_node_ref);
1767     }
1768     block_mac = &path->tree->root;
1769     assert(block_mac_same_block(tr, block_mac, &path->entry[0].block_mac));
1770     assert(block_mac_eq(tr, block_mac, &path->entry[0].block_mac));
1771     block_put_dirty(tr, data, data_ref, block_mac, NULL);
1772     assert(!block_mac_eq(tr, block_mac, &path->entry[0].block_mac) ||
1773            tr->fs->mac_size < 16);
1774     block_mac_copy_mac(tr, &path->entry[0].block_mac, block_mac);
1775     assert(block_mac_eq(tr, block_mac, &path->entry[0].block_mac));
1776 }
1777 
1778 /**
1779  * block_tree_update_key - Update key in internal parent node
1780  * @tr:         Transaction object.
1781  * @path:       Tree path object.
1782  * @path_index: Path depth start search at.
1783  * @new_key:    New key value.
1784  *
1785  * Search path until a left parent key is found (non-zero index) then set it
1786  * to @new_key.
1787  *
1788  * TODO: merge with mac update?
1789  */
block_tree_update_key(struct transaction * tr,struct block_tree_path * path,int path_index,data_block_t new_key)1790 void block_tree_update_key(struct transaction* tr,
1791                            struct block_tree_path* path,
1792                            int path_index,
1793                            data_block_t new_key) {
1794     const struct block_mac* block_mac;
1795     unsigned int index;
1796     struct block_tree_node* node_rw;
1797     struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
1798 
1799     assert(new_key);
1800 
1801     while (path_index >= 0) {
1802         assert(path_index < (int)countof(path->entry));
1803         block_mac = &path->entry[path_index].block_mac;
1804         index = path->entry[path_index].index;
1805         if (index == 0) {
1806             path_index--;
1807             continue;
1808         }
1809         node_rw = block_tree_block_get_write(tr, path, path_index, &node_ref);
1810         if (tr->failed) {
1811             pr_warn("transaction failed, abort\n");
1812             return;
1813         }
1814         assert(node_rw);
1815         if (print_internal_changes) {
1816             printf("%s: block %" PRIu64 ", index %d, [%" PRIu64
1817                    "-] -> [%" PRIu64 "-]\n",
1818                    __func__, block_mac_to_block(tr, block_mac), index,
1819                    block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
1820                                            node_rw, index - 1),
1821                    new_key);
1822             block_tree_node_print(tr, path->tree,
1823                                   block_mac_to_block(tr, block_mac), node_rw);
1824         }
1825         assert(!block_tree_node_is_leaf(node_rw));
1826         assert(index == 1 || new_key >= block_tree_node_get_key(
1827                                                 path->tree, DATA_BLOCK_INVALID,
1828                                                 node_rw, index - 2));
1829         assert(index == block_tree_node_max_key_count(path->tree, node_rw) ||
1830                new_key <= block_tree_node_get_key(path->tree,
1831                                                   DATA_BLOCK_INVALID, node_rw,
1832                                                   index) ||
1833                !block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_rw,
1834                                         index));
1835         assert(path->entry[path_index].prev_key ==
1836                block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_rw,
1837                                        index - 1));
1838         block_tree_node_set_key(path->tree, node_rw, index - 1, new_key);
1839         assert(block_tree_node_check(tr, path->tree, node_rw,
1840                                      block_mac_to_block(tr, block_mac), 0,
1841                                      DATA_BLOCK_MAX));
1842         path->entry[path_index].prev_key = new_key;
1843         if (print_internal_changes) {
1844             block_tree_node_print(tr, path->tree,
1845                                   block_mac_to_block(tr, block_mac), node_rw);
1846         }
1847         block_tree_path_put_dirty(tr, path, path_index, node_rw, &node_ref);
1848         return;
1849     }
1850     if (print_internal_changes) {
1851         printf("%s: root reached, no update needed (new key %" PRIu64 ")\n",
1852                __func__, new_key);
1853     }
1854 }
1855 
1856 /**
1857  * block_tree_node_get_key_count - Get number of keys currently stored in node
1858  * @tree:           Tree object.
1859  * @node_ro:        Node.
1860  */
block_tree_node_get_key_count(const struct block_tree * tree,const struct block_tree_node * node_ro)1861 static int block_tree_node_get_key_count(
1862         const struct block_tree* tree,
1863         const struct block_tree_node* node_ro) {
1864     unsigned int i;
1865     unsigned int max_key_count = block_tree_node_max_key_count(tree, node_ro);
1866 
1867     for (i = 0; i < max_key_count; i++) {
1868         if (!block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, i)) {
1869             break;
1870         }
1871     }
1872 
1873     return i;
1874 }
1875 
1876 /**
1877  * block_tree_node_leaf_remove_entry - Remove entry from a leaf node
1878  * @tr:             Transaction object.
1879  * @tree:           Tree object.
1880  * @node_block_mac: Block number and mac for @node_rw.
1881  * @node_rw:        Node.
1882  * @index:          Index of entry to remove.
1883  */
block_tree_node_leaf_remove_entry(struct transaction * tr,const struct block_tree * tree,const struct block_mac * node_block_mac,struct block_tree_node * node_rw,unsigned int index)1884 static void block_tree_node_leaf_remove_entry(
1885         struct transaction* tr,
1886         const struct block_tree* tree,
1887         const struct block_mac* node_block_mac,
1888         struct block_tree_node* node_rw,
1889         unsigned int index) {
1890     unsigned int max_key_count = block_tree_node_max_key_count(tree, node_rw);
1891 
1892     if (print_internal_changes) {
1893         printf("%s: index %d:", __func__, index);
1894         block_tree_node_print(tr, tree, block_mac_to_block(tr, node_block_mac),
1895                               node_rw);
1896     }
1897     assert(index < max_key_count);
1898     assert(block_tree_node_is_leaf(node_rw));
1899     block_tree_node_shift_down(tree, node_rw, index, index + 1,
1900                                SHIFT_LEAF_OR_LEFT_CHILD);
1901     if (print_internal_changes) {
1902         printf("%s: done:   ", __func__);
1903         block_tree_node_print(tr, tree, block_mac_to_block(tr, node_block_mac),
1904                               node_rw);
1905     }
1906 }
1907 
1908 /**
1909  * block_tree_node_split - Split a full node and add entry.
1910  * @tr:             Transaction object.
1911  * @path:           Tree path object.
1912  * @append_key:     Key to add.
1913  * @append_child:   Child to add if @path points to an internal node.
1914  * @append_data:    Data to add if @path points to a leaf node.
1915  *
1916  * Allocate a new node and move half of the entries to it. If the old node was
1917  * the root node, allocate a new root node. Add new node to parent.
1918  */
block_tree_node_split(struct transaction * tr,struct block_tree_path * path,data_block_t append_key,const struct block_mac * append_child,const struct block_mac * append_data)1919 static void block_tree_node_split(struct transaction* tr,
1920                                   struct block_tree_path* path,
1921                                   data_block_t append_key,
1922                                   const struct block_mac* append_child,
1923                                   const struct block_mac* append_data) {
1924     struct block_tree_node* parent_node_rw;
1925     struct obj_ref parent_node_ref = OBJ_REF_INITIAL_VALUE(parent_node_ref);
1926     struct block_tree_node* node_left_rw;
1927     struct obj_ref node_left_ref = OBJ_REF_INITIAL_VALUE(node_left_ref);
1928     struct block_tree_node* node_right_rw;
1929     struct obj_ref node_right_ref = OBJ_REF_INITIAL_VALUE(node_right_ref);
1930     bool is_leaf;
1931     struct block_mac* left_block_mac;
1932     struct block_mac* right_block_mac;
1933     data_block_t left_block_num;
1934     struct block_mac right;
1935     const struct block_mac* block_mac;
1936     const struct block_mac* parent_block_mac;
1937     unsigned int parent_index;
1938     int split_index;
1939     data_block_t parent_key;
1940     data_block_t overflow_key = 0;
1941     struct block_mac overflow_child;
1942     size_t max_key_count;
1943 
1944     assert(path->count > 0);
1945     assert(path->tree);
1946 
1947     block_mac = &path->entry[path->count - 1].block_mac;
1948     path->count--;
1949 
1950     if (print_internal_changes) {
1951         printf("%s: tree root %" PRIu64 ", block %" PRIu64 "\n", __func__,
1952                block_mac_to_block(tr, &path->tree->root),
1953                block_mac_to_block(tr, block_mac));
1954     }
1955 
1956     assert(append_key);
1957     /* we should only insert one node at a time */
1958     assert(!path->tree->inserting.block);
1959     path->tree->inserting.block = block_mac_to_block(tr, block_mac);
1960     path->tree->inserting.key = append_key;
1961     if (append_data) {
1962         assert(!append_child);
1963         path->tree->inserting.data = *append_data;
1964     }
1965     if (append_child) {
1966         assert(!append_data);
1967         path->tree->inserting.child = *append_child;
1968     }
1969 
1970     assert(!path->tree->copy_on_write || path->tree->allow_copy_on_write);
1971 
1972     block_mac_set_block(
1973             tr, &right,
1974             block_allocate_etc(tr, !path->tree->allow_copy_on_write));
1975     if (!path->count) {
1976         /* use old block at the new root */
1977         left_block_num =
1978                 block_allocate_etc(tr, !path->tree->allow_copy_on_write);
1979     } else {
1980         left_block_num = block_mac_to_block(tr, block_mac);
1981     }
1982     if (tr->failed) {
1983         pr_warn("transaction failed, abort\n");
1984         return;
1985     }
1986     assert(block_mac);
1987     assert(block_mac_valid(tr, block_mac));
1988     assert(path->tree->inserting.block == block_mac_to_block(tr, block_mac));
1989     assert(path->tree->inserting.key == append_key);
1990     assert(block_mac_to_block(tr, &path->tree->inserting.data) ==
1991            (append_data ? block_mac_to_block(tr, append_data) : 0));
1992     assert(block_mac_to_block(tr, &path->tree->inserting.child) ==
1993            (append_child ? block_mac_to_block(tr, append_child) : 0));
1994     memset(&path->tree->inserting, 0, sizeof(path->tree->inserting));
1995 
1996     if (print_internal_changes) {
1997         printf("%s: %" PRIu64 " -> %" PRIu64 " %" PRIu64 "\n", __func__,
1998                block_mac_to_block(tr, block_mac), left_block_num,
1999                block_mac_to_block(tr, &right));
2000     }
2001 
2002     node_left_rw =
2003             block_tree_block_get_write(tr, path, path->count, &node_left_ref);
2004     if (!node_left_rw) {
2005         assert(tr->failed);
2006         pr_warn("transaction failed, abort\n");
2007         return;
2008     }
2009     assert(node_left_rw);
2010     is_leaf = block_tree_node_is_leaf(node_left_rw);
2011     if (!path->count) {
2012         assert(left_block_num != block_mac_to_block(tr, block_mac));
2013         parent_node_rw = node_left_rw;
2014         obj_ref_transfer(&parent_node_ref, &node_left_ref);
2015         /* TODO: use allocated node for root instead since we need to update the
2016          * mac anyway */
2017         node_left_rw = block_get_copy(tr, node_left_rw, left_block_num,
2018                                       !path->tree->allow_copy_on_write,
2019                                       &node_left_ref);
2020         assert(node_left_rw);
2021         if (tr->failed) {
2022             pr_warn("transaction failed, abort\n");
2023             goto err_copy_parent;
2024         }
2025         // parent_node_rw = block_get_cleared(path->tree->root.block);
2026         /* TODO: use block_get_cleared */
2027         memset(parent_node_rw, 0, path->tree->block_size);
2028         parent_index = 0;
2029         left_block_mac = block_tree_node_get_child_data_rw(
2030                 path->tree, parent_node_rw, parent_index);
2031         block_mac_set_block(tr, left_block_mac, left_block_num);
2032         parent_block_mac = block_mac;
2033     } else {
2034         assert(left_block_num == block_mac_to_block(tr, block_mac));
2035         parent_block_mac = &path->entry[path->count - 1].block_mac;
2036         parent_index = path->entry[path->count - 1].index;
2037         parent_node_rw = block_tree_block_get_write(tr, path, path->count - 1,
2038                                                     &parent_node_ref);
2039         if (tr->failed) {
2040             pr_warn("transaction failed, abort\n");
2041             goto err_get_parent;
2042         }
2043         assert(parent_node_rw);
2044         assert(!block_tree_node_is_leaf(parent_node_rw));
2045         left_block_mac = block_tree_node_get_child_data_rw(
2046                 path->tree, parent_node_rw, parent_index);
2047     }
2048     assert(block_mac_to_block(tr, left_block_mac) == left_block_num);
2049     assert(!tr->failed);
2050     node_right_rw =
2051             block_get_copy(tr, node_left_rw, block_mac_to_block(tr, &right),
2052                            !path->tree->allow_copy_on_write, &node_right_ref);
2053     if (tr->failed) {
2054         pr_warn("transaction failed, abort\n");
2055         goto err_copy_right;
2056     }
2057     assert(block_tree_node_is_leaf(node_left_rw) == is_leaf);
2058     assert(block_tree_node_is_leaf(node_right_rw) == is_leaf);
2059     assert(is_leaf || (append_key && append_child));
2060     assert(!is_leaf || (append_key && append_data));
2061     assert(block_tree_node_full(path->tree, node_left_rw));
2062     assert(!tr->failed);
2063 
2064     max_key_count = block_tree_node_max_key_count(path->tree, node_left_rw);
2065     split_index = (max_key_count + 1) / 2;
2066 
2067     if (print_internal_changes) {
2068         printf("%s: insert split_index %d", __func__, split_index);
2069         block_tree_node_print(tr, path->tree, left_block_num, node_left_rw);
2070         if (is_leaf) {
2071             printf("%s: append key, data: [%" PRIu64 ": %" PRIu64 "]\n",
2072                    __func__, append_key, block_mac_to_block(tr, append_data));
2073         } else {
2074             printf("%s: append key, child: [%" PRIu64 "-] %" PRIu64 "\n",
2075                    __func__, append_key, block_mac_to_block(tr, append_child));
2076         }
2077     }
2078 
2079     /* Update left node */
2080     block_tree_node_clear_end(path->tree, node_left_rw, split_index);
2081     if (print_internal_changes) {
2082         printf("%s: left  %3" PRIu64 "", __func__, left_block_num);
2083         block_tree_node_print(tr, path->tree, left_block_num, node_left_rw);
2084     }
2085 
2086     /*
2087      * Update right node. For internal nodes the key at split_index moves to
2088      * the parent, for leaf nodes it gets copied
2089      */
2090     parent_key = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2091                                          node_right_rw, split_index);
2092     block_tree_node_shift_down(path->tree, node_right_rw, 0,
2093                                split_index + !is_leaf,
2094                                SHIFT_LEAF_OR_LEFT_CHILD);
2095     assert(max_key_count ==
2096            block_tree_node_max_key_count(path->tree, node_right_rw));
2097     block_tree_node_insert(
2098             path->tree, node_right_rw, max_key_count - split_index - !is_leaf,
2099             is_leaf ? SHIFT_LEAF_OR_LEFT_CHILD : SHIFT_RIGHT_CHILD, &append_key,
2100             is_leaf ? append_data : append_child, NULL, NULL);
2101     if (print_internal_changes) {
2102         printf("%s: right %3" PRIu64 "", __func__,
2103                block_mac_to_block(tr, &right));
2104         block_tree_node_print(tr, path->tree, block_mac_to_block(tr, &right),
2105                               node_right_rw);
2106     }
2107 
2108     /* Insert right node in parent node */
2109     assert(!block_tree_node_is_leaf(parent_node_rw));
2110 
2111     if (print_internal_changes) {
2112         printf("%s: insert [%" PRIu64 "-] %" PRIu64 " @%d:", __func__,
2113                parent_key, block_mac_to_block(tr, &right), parent_index);
2114         block_tree_node_print(tr, path->tree,
2115                               block_mac_to_block(tr, parent_block_mac),
2116                               parent_node_rw);
2117     }
2118 
2119     assert(parent_index ==
2120                    block_tree_node_max_key_count(path->tree, parent_node_rw) ||
2121            !block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2122                                     parent_node_rw, parent_index) ||
2123            parent_key < block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2124                                                 parent_node_rw, parent_index));
2125     assert(parent_index == 0 ||
2126            block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2127                                    parent_node_rw,
2128                                    parent_index - 1) <= parent_key);
2129     block_tree_node_insert(path->tree, parent_node_rw, parent_index,
2130                            SHIFT_RIGHT_CHILD, &parent_key, &right,
2131                            &overflow_key, &overflow_child);
2132     assert(!overflow_key == !block_mac_valid(tr, &overflow_child));
2133     /*
2134      * If overflow_key is set it is not safe to re-enter the tree at this point
2135      * as overflow_child is not reachable. It becomes reachable again when
2136      * saved in tree->inserting.child at the top of this function.
2137      */
2138 
2139     if (parent_index <
2140         block_tree_node_max_key_count(path->tree, parent_node_rw)) {
2141         right_block_mac = block_tree_node_get_child_data_rw(
2142                 path->tree, parent_node_rw, parent_index + 1);
2143     } else {
2144         assert(block_mac_valid(tr, &overflow_child));
2145         right_block_mac = &overflow_child;
2146     }
2147 
2148     if (print_internal_changes) {
2149         printf("%s: insert [%" PRIu64 "-] %" PRIu64 " @%d done:", __func__,
2150                parent_key, block_mac_to_block(tr, &right), parent_index);
2151         block_tree_node_print(tr, path->tree,
2152                               block_mac_to_block(tr, parent_block_mac),
2153                               parent_node_rw);
2154     }
2155 
2156     if (!path->count && print_internal_changes) {
2157         printf("%s root", __func__);
2158         block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2159                               parent_node_rw);
2160     }
2161 
2162     block_put_dirty(tr, node_left_rw, &node_left_ref, left_block_mac,
2163                     parent_node_rw);
2164     block_put_dirty(tr, node_right_rw, &node_right_ref, right_block_mac,
2165                     parent_node_rw);
2166     /* block_tree_path_put_dirty will handle negative depths */
2167     block_tree_path_put_dirty(tr, path, (int)path->count - 1, parent_node_rw,
2168                               &parent_node_ref);
2169 
2170     if (overflow_key) {
2171         assert(path->count); /* new root node should not overflow */
2172         if (print_internal_changes) {
2173             printf("%s: overflow [%" PRIu64 "-] %" PRIu64 "\n", __func__,
2174                    overflow_key, block_mac_to_block(tr, &overflow_child));
2175         }
2176         /* TODO: use a loop instead */
2177         block_tree_node_split(tr, path, overflow_key, &overflow_child, 0);
2178     }
2179     return;
2180 
2181 err_copy_right:
2182     block_put_dirty_discard(node_right_rw, &node_right_ref);
2183 err_copy_parent:
2184     block_put_dirty_discard(parent_node_rw, &parent_node_ref);
2185 err_get_parent:
2186     block_put_dirty_discard(node_left_rw, &node_left_ref);
2187 }
2188 
2189 /**
2190  * block_tree_sibling_index - Get the index of a sibling node
2191  * @index:      Index of current node.
2192  *
2193  * Use left sibling when possible, otherwise use right sibling. Other siblings
2194  * could be used, but this simple rule avoids selecting an empty sibling if
2195  * a non-empty sibling exists.
2196  */
block_tree_sibling_index(int index)2197 static int block_tree_sibling_index(int index) {
2198     return !index ? 1 : index - 1;
2199 }
2200 
2201 /**
2202  * block_tree_get_sibling_block - Get block and mac for sibling block
2203  * @tr:             Transaction object.
2204  * @path:           Tree path object.
2205  *
2206  * Return: block_mac of sibling block selected by block_tree_sibling_index
2207  */
block_tree_get_sibling_block(struct transaction * tr,struct block_tree_path * path)2208 static struct block_mac block_tree_get_sibling_block(
2209         struct transaction* tr,
2210         struct block_tree_path* path) {
2211     struct block_mac block_mac = BLOCK_MAC_INITIAL_VALUE(block_mac);
2212     const struct block_mac* parent;
2213     const struct block_mac* block_mac_ptr;
2214     int parent_index;
2215     const struct block_tree_node* node_ro;
2216     struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2217 
2218     assert(path->tree);
2219     assert(path->count > 1);
2220 
2221     parent = &path->entry[path->count - 2].block_mac;
2222     parent_index = path->entry[path->count - 2].index;
2223 
2224     node_ro = block_get(tr, parent, NULL, &node_ref);
2225     if (!node_ro) {
2226         assert(tr->failed);
2227         pr_warn("transaction failed, abort\n");
2228         goto err;
2229     }
2230     parent_index = block_tree_sibling_index(parent_index);
2231     block_mac_ptr =
2232             block_tree_node_get_child_data(path->tree, node_ro, parent_index);
2233     block_mac_copy(tr, &block_mac, block_mac_ptr);
2234     assert(block_mac_valid(tr, &block_mac));
2235     block_put(node_ro, &node_ref);
2236 err:
2237     return block_mac;
2238 }
2239 
2240 /**
2241  * block_tree_path_set_sibling_block - Set path to point to sibling block_mac
2242  * @path:           Tree path object.
2243  * @block_mac:      Block-mac of sibling block to swap with path entry
2244  * @left:           %true if @block_mac is the left sibling on entry (right on
2245  *                  return) %false otherwise.
2246  *
2247  * Update @path to point to the left (if @left is %true) or right sibling node
2248  * @block_mac, and return the block_mac of the old node that @path pointed to in
2249  * @block_mac.
2250  *
2251  * This is used to toggle the path between two sibling nodes without re-reading
2252  * the parent node. It does not preserve the prev_key or next_key value that is
2253  * not shared beween the siblings.
2254  */
block_tree_path_set_sibling_block(struct block_tree_path * path,struct block_mac * block_mac,bool left)2255 static void block_tree_path_set_sibling_block(struct block_tree_path* path,
2256                                               struct block_mac* block_mac,
2257                                               bool left) {
2258     int parent_index;
2259     struct block_mac tmp;
2260 
2261     assert(path->count > 1);
2262     parent_index = path->entry[path->count - 2].index;
2263     // assert(left || parent_index <= 1); // assert only valid on initial call
2264     assert(!left || parent_index > 0);
2265 
2266     if (left) {
2267         parent_index--;
2268     } else {
2269         parent_index++;
2270     }
2271 
2272     if (print_internal_changes) {
2273         printf("%s: path[%d].index: %d -> %d\n", __func__, path->count - 2,
2274                path->entry[path->count - 2].index, parent_index);
2275         printf("%s: path[%d].block_mac.block: %" PRIu64 " -> %" PRIu64 "\n",
2276                __func__, path->count - 1,
2277                block_mac_to_block(path->tr,
2278                                   &path->entry[path->count - 1].block_mac),
2279                block_mac_to_block(path->tr, block_mac));
2280     }
2281 
2282     path->entry[path->count - 2].index = parent_index;
2283     if (left) {
2284         path->entry[path->count - 2].next_key =
2285                 path->entry[path->count - 2].prev_key;
2286         path->entry[path->count - 2].prev_key = 0;  // unknown
2287     } else {
2288         path->entry[path->count - 2].prev_key =
2289                 path->entry[path->count - 2].next_key;
2290         path->entry[path->count - 2].next_key = 0;  // unknown
2291     }
2292 
2293     tmp = *block_mac;
2294     *block_mac = path->entry[path->count - 1].block_mac;
2295     path->entry[path->count - 1].block_mac = tmp;
2296 }
2297 
2298 static void block_tree_node_merge(struct transaction* tr,
2299                                   struct block_tree_path* path);
2300 
2301 /**
2302  * block_tree_remove_internal - Remove key and right child from internal node
2303  * @tr:             Transaction object.
2304  * @path:           Tree path object.
2305  *
2306  * Removes an entry from an internal node, and check the resulting node follows
2307  * B+ tree rules. If a root node would have no remaining entries, delete it. If
2308  * a non-root node has too few remaining entries, call block_tree_node_merge
2309  * which will either borrow an entry from, or merge with a sibling node.
2310  *
2311  * TODO: merge with block_tree_node_merge and avoid recursion?
2312  */
block_tree_remove_internal(struct transaction * tr,struct block_tree_path * path)2313 static void block_tree_remove_internal(struct transaction* tr,
2314                                        struct block_tree_path* path) {
2315     const struct block_tree_node* node_ro;
2316     struct block_tree_node* node_rw;
2317     struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2318     int index;
2319     bool need_merge;
2320     const struct block_mac* block_mac;
2321 
2322     assert(path->count);
2323 
2324     block_mac = &path->entry[path->count - 1].block_mac;
2325     index = path->entry[path->count - 1].index;
2326     node_ro = block_get(tr, block_mac, NULL, &node_ref);
2327     if (!node_ro) {
2328         assert(tr->failed);
2329         pr_warn("transaction failed, abort\n");
2330         return;
2331     }
2332     assert(!block_tree_node_is_leaf(node_ro));
2333     assert(index > 0);
2334 
2335     if (print_internal_changes) {
2336         printf("%s: @%d:", __func__, index);
2337         block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2338                               node_ro);
2339     }
2340 
2341     if (path->count == 1 &&
2342         !block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_ro, 1)) {
2343         /* block_tree_node_merge always removes right node */
2344         assert(index == 1);
2345         const struct block_mac* new_root =
2346                 block_tree_node_get_child_data(path->tree, node_ro, 0);
2347         if (print_internal_changes) {
2348             printf("%s: root emptied, new root %" PRIu64 "\n", __func__,
2349                    block_mac_to_block(tr, new_root));
2350         }
2351         assert(block_mac_valid(tr, new_root));
2352         path->tree->root = *new_root;
2353         block_discard_dirty(node_ro);
2354         block_put(node_ro, &node_ref);
2355         assert(!path->tree->copy_on_write || path->tree->allow_copy_on_write);
2356         block_free_etc(tr, block_mac_to_block(tr, block_mac),
2357                        !path->tree->allow_copy_on_write);
2358         return;
2359     }
2360 
2361     node_rw = block_tree_block_dirty(tr, path, path->count - 1, node_ro);
2362     if (tr->failed) {
2363         block_put(node_ro, &node_ref);
2364         pr_warn("transaction failed, abort\n");
2365         return;
2366     }
2367 
2368     block_tree_node_shift_down(path->tree, node_rw, index - 1, index,
2369                                SHIFT_RIGHT_CHILD);
2370 
2371     if (print_internal_changes) {
2372         printf("%s: @%d done:", __func__, index);
2373         block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2374                               node_rw);
2375     }
2376     need_merge =
2377             path->count > 1 && block_tree_below_min_full(path->tree, node_rw);
2378     block_tree_path_put_dirty(tr, path, path->count - 1, node_rw, &node_ref);
2379 
2380     if (need_merge) {
2381         block_tree_node_merge(tr, path);
2382     }
2383 }
2384 
2385 /**
2386  * block_tree_node_merge - Merge node or borrow entry from sibling block
2387  * @tr:             Transaction object.
2388  * @path:           Tree path object.
2389  *
2390  * If sibling block has more entries than it needs, migrate an entry from it,
2391  * otherwise merge the two nodes and free the block that stored the right node.
2392  */
block_tree_node_merge(struct transaction * tr,struct block_tree_path * path)2393 static void block_tree_node_merge(struct transaction* tr,
2394                                   struct block_tree_path* path) {
2395     const struct block_tree_node* node_ro;
2396     const struct block_tree_node* merge_node_ro;
2397     struct block_tree_node* node_rw;
2398     struct block_tree_node* merge_node_rw;
2399     struct obj_ref node_ref1 = OBJ_REF_INITIAL_VALUE(node_ref1);
2400     struct obj_ref node_ref2 = OBJ_REF_INITIAL_VALUE(node_ref2);
2401     struct obj_ref* node_ref = &node_ref1;
2402     struct obj_ref* merge_node_ref = &node_ref2;
2403     const struct block_mac* block_mac;
2404     struct block_mac merge_block;
2405     unsigned int src_index;
2406     unsigned int dest_index;
2407     data_block_t key;
2408     data_block_t parent_key;
2409     int index;
2410     bool node_is_left;
2411     bool is_leaf;
2412 
2413     assert(path->count > 1);
2414     assert(path->tree);
2415 
2416     block_mac = &path->entry[path->count - 1].block_mac;
2417     node_is_left = !path->entry[path->count - 2].index;
2418     merge_block = block_tree_get_sibling_block(tr, path);
2419 
2420     if (tr->failed) {
2421         pr_warn("transaction failed, abort\n");
2422         return;
2423     }
2424 
2425     node_ro = block_get(tr, block_mac, NULL, node_ref);
2426     if (!node_ro) {
2427         assert(tr->failed);
2428         pr_warn("transaction failed, abort\n");
2429         return;
2430     }
2431     assert(node_ro);
2432     is_leaf = block_tree_node_is_leaf(node_ro);
2433 
2434     merge_node_ro = block_get(tr, &merge_block, NULL, merge_node_ref);
2435     if (!merge_node_ro) {
2436         assert(tr->failed);
2437         block_put(node_ro, node_ref);
2438         pr_warn("transaction failed, abort\n");
2439         return;
2440     }
2441     assert(merge_node_ro);
2442     assert(is_leaf == block_tree_node_is_leaf(merge_node_ro));
2443 
2444     if (print_internal_changes) {
2445         printf("%s: node_is_left %d\n", __func__, node_is_left);
2446         block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2447                               node_ro);
2448         block_tree_node_print(tr, path->tree,
2449                               block_mac_to_block(tr, &merge_block),
2450                               merge_node_ro);
2451     }
2452 
2453     assert(block_tree_below_min_full(path->tree, node_ro));
2454     assert(!block_tree_below_min_full(path->tree, merge_node_ro));
2455     if (block_tree_above_min_full(path->tree, merge_node_ro)) {
2456         /* migrate entries instead */
2457         block_tree_path_set_sibling_block(path, &merge_block, !node_is_left);
2458         assert(!tr->failed);
2459         merge_node_rw = block_tree_block_dirty(tr, path, path->count - 1,
2460                                                merge_node_ro);
2461         block_tree_path_set_sibling_block(path, &merge_block, node_is_left);
2462         if (!merge_node_rw) {
2463             assert(tr->failed);
2464             block_put(node_ro, node_ref);
2465             block_put(merge_node_ro, merge_node_ref);
2466             pr_warn("transaction failed, abort\n");
2467             return;
2468         }
2469         assert(!block_tree_node_need_copy(tr, path->tree, &merge_block));
2470 
2471         node_rw = block_dirty(tr, node_ro, !path->tree->copy_on_write);
2472         assert(!block_tree_node_need_copy(tr, path->tree, block_mac));
2473 
2474         if (node_is_left) {
2475             src_index = 0;
2476             dest_index = block_tree_node_min_full_index(path->tree, node_rw);
2477         } else {
2478             src_index =
2479                     block_tree_node_get_key_count(path->tree, merge_node_rw) -
2480                     1;
2481             dest_index = 0;
2482         }
2483 
2484         key = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2485                                       merge_node_rw, src_index);
2486         if (node_is_left && is_leaf) {
2487             parent_key = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2488                                                  merge_node_rw, 1);
2489         } else {
2490             parent_key = key;
2491         }
2492         if (!is_leaf) {
2493             if (node_is_left) {
2494                 key = path->entry[path->count - 2].next_key;
2495             } else {
2496                 key = path->entry[path->count - 2].prev_key;
2497             }
2498         }
2499 
2500         block_tree_node_insert(path->tree, node_rw, dest_index,
2501                                (node_is_left && !is_leaf)
2502                                        ? SHIFT_RIGHT_CHILD
2503                                        : SHIFT_LEAF_OR_LEFT_CHILD,
2504                                &key,
2505                                block_tree_node_get_child_data(
2506                                        path->tree, merge_node_rw,
2507                                        src_index + (!node_is_left && !is_leaf)),
2508                                NULL, NULL);
2509 
2510         block_tree_node_shift_down(
2511                 path->tree, merge_node_rw, src_index, src_index + 1,
2512                 (node_is_left || is_leaf) ? SHIFT_LEAF_OR_LEFT_CHILD
2513                                           : SHIFT_RIGHT_CHILD);
2514 
2515         if (node_is_left) {
2516             if (dest_index == 0 && is_leaf) {
2517                 data_block_t key0;
2518                 key0 = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2519                                                node_rw, 0);
2520                 assert(key0);
2521                 block_tree_update_key(tr, path, path->count - 2, key0);
2522             }
2523             block_tree_path_set_sibling_block(path, &merge_block,
2524                                               !node_is_left);
2525         }
2526         block_tree_update_key(tr, path, path->count - 2, parent_key);
2527         if (node_is_left) {
2528             block_tree_path_set_sibling_block(path, &merge_block, node_is_left);
2529         }
2530         if (print_internal_changes) {
2531             printf("%s: %" PRIu64 " %" PRIu64 " done\n", __func__,
2532                    block_mac_to_block(tr, block_mac),
2533                    block_mac_to_block(tr, &merge_block));
2534             block_tree_node_print(tr, path->tree,
2535                                   block_mac_to_block(tr, block_mac), node_rw);
2536             block_tree_node_print(tr, path->tree,
2537                                   block_mac_to_block(tr, &merge_block),
2538                                   merge_node_rw);
2539         }
2540         block_tree_path_put_dirty(tr, path, path->count - 1, node_rw, node_ref);
2541         block_tree_path_set_sibling_block(path, &merge_block, !node_is_left);
2542         block_tree_path_put_dirty(tr, path, path->count - 1, merge_node_rw,
2543                                   merge_node_ref);
2544         block_tree_path_set_sibling_block(path, &merge_block, node_is_left);
2545     } else {
2546         const struct block_mac* left;
2547         const struct block_mac* right;
2548         data_block_t* merge_key;
2549         if (!node_is_left) {
2550             const struct block_tree_node* tmp_node;
2551             struct obj_ref* tmp_ref;
2552             tmp_node = node_ro;
2553             node_ro = merge_node_ro;
2554             merge_node_ro = tmp_node;
2555             tmp_ref = node_ref;
2556             node_ref = merge_node_ref;
2557             merge_node_ref = tmp_ref;
2558             block_tree_path_set_sibling_block(path, &merge_block, true);
2559         }
2560         left = block_mac;
2561         right = &merge_block;
2562         node_rw = block_tree_block_dirty(tr, path, path->count - 1, node_ro);
2563         if (!node_rw) {
2564             assert(tr->failed);
2565             block_put(node_ro, node_ref);
2566             block_put(merge_node_ro, merge_node_ref);
2567             pr_warn("transaction failed, abort\n");
2568             return;
2569         }
2570         assert(!block_tree_node_need_copy(tr, path->tree, left));
2571 
2572         index = block_tree_node_get_key_count(path->tree, node_rw);
2573         if (is_leaf) {
2574             merge_key = NULL;
2575         } else {
2576             merge_key = &path->entry[path->count - 2].next_key;
2577             assert(*merge_key);
2578         }
2579         block_tree_node_merge_entries(
2580                 path->tree, node_rw, merge_node_ro, index,
2581                 block_tree_node_get_key_count(path->tree, merge_node_ro),
2582                 merge_key);
2583         assert(block_tree_node_check(tr, path->tree, node_rw,
2584                                      block_mac_to_block(tr, left), 0,
2585                                      DATA_BLOCK_MAX));
2586 
2587         if (is_leaf && !block_tree_node_min_full_index(path->tree, node_rw) &&
2588             !index) {
2589             data_block_t key0;
2590             key0 = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2591                                            node_rw, 0);
2592             assert(key0);
2593             if (print_internal_changes) {
2594                 printf("hack, special case for order <= 4 tree\n");
2595             }
2596             block_tree_update_key(tr, path, path->count - 2, key0);
2597         }
2598 
2599         if (print_internal_changes) {
2600             printf("%s: merge done, free %" PRIu64 "\n", __func__,
2601                    block_mac_to_block(tr, right));
2602             block_tree_node_print(tr, path->tree, block_mac_to_block(tr, left),
2603                                   node_rw);
2604         }
2605 
2606         block_tree_path_put_dirty(tr, path, path->count - 1, node_rw, node_ref);
2607         block_discard_dirty(merge_node_ro);
2608         block_put(merge_node_ro, merge_node_ref);
2609         block_tree_path_set_sibling_block(path, &merge_block, false);
2610         path->count--;
2611         block_tree_remove_internal(tr, path);
2612         block_free_etc(tr, block_mac_to_block(tr, block_mac),
2613                        !path->tree->allow_copy_on_write);
2614     }
2615 }
2616 
2617 /**
2618  * block_tree_insert_block_mac - Insert an entry into a B+ tree
2619  * @tr:             Transaction object.
2620  * @tree:           Tree object.
2621  * @key:            Key of new entry.
2622  * @data:           Data of new entry.
2623  *
2624  * Find a valid insert point for @key and insert @key and @data there. If this
2625  * node becomes overfull, split it.
2626  *
2627  * TODO: use a void * for data and merge with block_tree_insert.
2628  * TODO: Allow insert by path
2629  */
block_tree_insert_block_mac(struct transaction * tr,struct block_tree * tree,data_block_t key,struct block_mac data)2630 void block_tree_insert_block_mac(struct transaction* tr,
2631                                  struct block_tree* tree,
2632                                  data_block_t key,
2633                                  struct block_mac data) {
2634     const struct block_tree_node* node_ro;
2635     struct block_tree_node* node_rw;
2636     struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2637     const struct block_mac* block_mac;
2638     data_block_t overflow_key = 0;
2639     struct block_mac overflow_data;
2640     int index;
2641     struct block_tree_path path;
2642 
2643     assert(!tr->failed);
2644     assert(!tree->updating);
2645     assert(key);
2646     assert(block_mac_valid(tr, &data));
2647 
2648     tree->updating = true;
2649 
2650     if (!block_mac_valid(tr, &tree->root)) {
2651         assert(!tree->copy_on_write || tree->allow_copy_on_write);
2652         block_mac_set_block(tr, &tree->root,
2653                             block_allocate_etc(tr, !tree->allow_copy_on_write));
2654         if (tr->failed) {
2655             pr_warn("transaction failed, abort\n");
2656             goto err;
2657         }
2658         if (print_internal_changes) {
2659             printf("%s: new root block %" PRIu64 "\n", __func__,
2660                    block_mac_to_block(tr, &tree->root));
2661         }
2662         assert(block_mac_valid(tr, &tree->root));
2663         node_rw = block_get_cleared(tr, block_mac_to_block(tr, &tree->root),
2664                                     !tree->allow_copy_on_write, &node_ref);
2665         node_rw->is_leaf = true;
2666         block_put_dirty(tr, node_rw, &node_ref, &tree->root, NULL);
2667         tree->root_block_changed = true;
2668     }
2669 
2670     block_tree_walk(tr, tree, key, false, &path);
2671     if (tr->failed) {
2672         pr_warn("transaction failed, abort\n");
2673         goto err;
2674     }
2675 
2676     assert(path.count > 0);
2677 
2678     block_mac = &path.entry[path.count - 1].block_mac;
2679     index = path.entry[path.count - 1].index;
2680 
2681     node_ro = block_get(tr, block_mac, NULL, &node_ref);
2682     if (!node_ro) {
2683         assert(tr->failed);
2684         pr_warn("transaction failed, abort\n");
2685         goto err;
2686     }
2687 
2688     if (print_changes) {
2689         printf("%s: key %" PRIu64 ", data.block %" PRIu64 ", index %d\n",
2690                __func__, key, block_mac_to_block(tr, &data), index);
2691         assert(node_ro);
2692         block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac),
2693                               node_ro);
2694     }
2695 
2696     assert(node_ro);
2697     assert(block_tree_node_is_leaf(node_ro));
2698     assert(index || !path.entry[path.count - 1].prev_key ||
2699            block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, index) ==
2700                    key);
2701 
2702     node_rw = block_tree_block_dirty(tr, &path, path.count - 1, node_ro);
2703     if (!node_rw) {
2704         assert(tr->failed);
2705         pr_warn("transaction failed, abort\n");
2706         block_put(node_ro, &node_ref);
2707         goto err;
2708     }
2709 
2710     block_tree_node_insert(tree, node_rw, index, SHIFT_LEAF_OR_LEFT_CHILD, &key,
2711                            &data, &overflow_key, &overflow_data);
2712 
2713     block_tree_path_put_dirty(tr, &path, path.count - 1, node_rw, &node_ref);
2714 
2715     if (overflow_key) {
2716         assert(block_mac_valid(tr, &overflow_data));
2717         block_tree_node_split(tr, &path, overflow_key, NULL, &overflow_data);
2718     }
2719 
2720     if (print_changes_full_tree) {
2721         printf("%s: key %" PRIu64 ", data.block %" PRIu64 ", done:\n", __func__,
2722                key, block_mac_to_block(tr, &data));
2723         block_tree_print(tr, tree);
2724     }
2725 
2726 err:
2727     tree->update_count++;
2728     tree->updating = false;
2729     full_assert(block_tree_check(tr, tree));
2730 }
2731 
2732 /**
2733  * block_tree_insert - Insert a data_block_t type entry into a B+ tree
2734  * @tr:             Transaction object.
2735  * @tree:           Tree object.
2736  * @key:            Key of new entry.
2737  * @data_block:     Data of new entry.
2738  */
block_tree_insert(struct transaction * tr,struct block_tree * tree,data_block_t key,data_block_t data_block)2739 void block_tree_insert(struct transaction* tr,
2740                        struct block_tree* tree,
2741                        data_block_t key,
2742                        data_block_t data_block) {
2743     struct block_mac data = BLOCK_MAC_INITIAL_VALUE(data);
2744 
2745     block_mac_set_block(tr, &data, data_block);
2746 
2747     block_tree_insert_block_mac(tr, tree, key, data);
2748 }
2749 
2750 /**
2751  * block_tree_remove - Remove an entry from a B+ tree
2752  * @tr:             Transaction object.
2753  * @tree:           Tree object.
2754  * @key:            Key of entry to remove.
2755  * @data:           Data of entry to remove.
2756  *
2757  * Find an entry in the tree where the key matches @key, and the start of data
2758  * matches @data, and remove it from the tree. Call block_tree_node_merge if
2759  * the resulting node does follow B+ tree rules.
2760  *
2761  * TODO: Remove by by path instead
2762  */
block_tree_remove(struct transaction * tr,struct block_tree * tree,data_block_t key,data_block_t data)2763 void block_tree_remove(struct transaction* tr,
2764                        struct block_tree* tree,
2765                        data_block_t key,
2766                        data_block_t data) {
2767     struct block_tree_path path;
2768     const struct block_tree_node* node_ro;
2769     struct block_tree_node* node_rw;
2770     struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2771     const struct block_mac* block_mac;
2772     int index;
2773     bool need_merge = false;
2774     data_block_t new_parent_key;
2775 
2776     assert(!tr->failed);
2777     assert(!tree->updating);
2778     assert(block_mac_valid(tr, &tree->root));
2779     full_assert(block_tree_check(tr, tree));
2780     assert(key);
2781     assert(data);
2782 
2783     tree->updating = true;
2784 
2785     block_tree_walk(tr, tree, key, false, &path); /* TODO: make writeable */
2786     if (tr->failed) {
2787         pr_warn("transaction failed, abort\n");
2788         goto err;
2789     }
2790     assert(path.count > 0);
2791 
2792     if (block_mac_to_block(tr, &path.data) != data) {
2793         /* TODO: make path writeable after finding maching entry */
2794         block_tree_walk(tr, tree, key - 1, true, &path);
2795         while (block_mac_to_block(tr, &path.data) != data ||
2796                block_tree_path_get_key(&path) != key) {
2797             assert(block_tree_path_get_key(&path));
2798             block_tree_path_next(&path);
2799         }
2800     }
2801 
2802     block_mac = &path.entry[path.count - 1].block_mac;
2803     index = path.entry[path.count - 1].index;
2804 
2805     node_ro = block_get(tr, block_mac, NULL, &node_ref);
2806     if (!node_ro) {
2807         assert(tr->failed);
2808         pr_warn("transaction failed, abort\n");
2809         goto err;
2810     }
2811     assert(block_tree_node_is_leaf(node_ro));
2812     assert(block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, index) ==
2813            key);
2814     assert(!memcmp(block_tree_node_get_child_data(tree, node_ro, index), &data,
2815                    MIN(sizeof(data), tr->fs->block_num_size)));
2816 
2817     if (print_changes) {
2818         printf("%s: key %" PRIu64 ", data %" PRIu64 ", index %d\n", __func__,
2819                key, data, index);
2820         block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac),
2821                               node_ro);
2822     }
2823 
2824     node_rw = block_tree_block_dirty(tr, &path, path.count - 1, node_ro);
2825     if (tr->failed) {
2826         block_put(node_ro, &node_ref);
2827         pr_warn("transaction failed, abort\n");
2828         goto err;
2829     }
2830     assert(node_rw);
2831     block_tree_node_leaf_remove_entry(tr, tree, block_mac, node_rw, index);
2832     if (path.count > 1) {
2833         if (!index) {
2834             new_parent_key = block_tree_node_get_key(tree, DATA_BLOCK_INVALID,
2835                                                      node_rw, 0);
2836             assert(new_parent_key ||
2837                    !block_tree_node_min_full_index(tree, node_rw));
2838             if (new_parent_key) {
2839                 block_tree_update_key(tr, &path, path.count - 2,
2840                                       new_parent_key);
2841             }
2842         }
2843         need_merge = block_tree_below_min_full(tree, node_rw);
2844     }
2845     block_tree_path_put_dirty(tr, &path, path.count - 1, node_rw, &node_ref);
2846     if (need_merge) {
2847         block_tree_node_merge(tr, &path);
2848     }
2849 
2850     if (print_changes_full_tree) {
2851         printf("%s: key %" PRIu64 ", data %" PRIu64 ", done:\n", __func__, key,
2852                data);
2853         block_tree_print(tr, tree);
2854     }
2855 
2856 err:
2857     tree->update_count++;
2858     tree->updating = false;
2859     full_assert(block_tree_check(tr, tree));
2860 }
2861 
2862 /**
2863  * block_tree_update_block_mac - Update key or data of an existing B+ tree entry
2864  * @tr:             Transaction object.
2865  * @tree:           Tree object.
2866  * @old_key:        Key of entry to update.
2867  * @old_data:       Data of entry to update.
2868  * @new_key:        New key of entry.
2869  * @new_data:       New data of entry.
2870  *
2871  * Find an entry in the tree where the key matches @key, and the start of data
2872  * matches @data, and update it's key or data. When updating key, the new key
2873  * must not cause the entry to move in the tree.
2874  *
2875  * TODO: Update by path instead
2876  */
block_tree_update_block_mac(struct transaction * tr,struct block_tree * tree,data_block_t old_key,struct block_mac old_data,data_block_t new_key,struct block_mac new_data)2877 void block_tree_update_block_mac(struct transaction* tr,
2878                                  struct block_tree* tree,
2879                                  data_block_t old_key,
2880                                  struct block_mac old_data,
2881                                  data_block_t new_key,
2882                                  struct block_mac new_data) {
2883     struct block_tree_path path;
2884     const struct block_tree_node* node_ro;
2885     struct block_tree_node* node_rw;
2886     struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2887     const struct block_mac* block_mac;
2888     data_block_t prev_key;
2889     data_block_t next_key;
2890     unsigned int index;
2891     size_t max_key_count;
2892 
2893     assert(!tr->failed);
2894     assert(!tree->updating);
2895     assert(block_mac_valid(tr, &tree->root));
2896     full_assert(block_tree_check(tr, tree));
2897     assert(old_key);
2898     assert(block_mac_valid(tr, &old_data));
2899     assert(new_key);
2900     assert(block_mac_valid(tr, &new_data));
2901     assert(old_key == new_key ||
2902            block_mac_same_block(tr, &old_data, &new_data));
2903     assert(old_key != new_key ||
2904            !block_mac_same_block(tr, &old_data, &new_data));
2905 
2906     tree->updating = true;
2907 
2908     block_tree_walk(tr, tree, old_key - 1, true, &path);
2909     prev_key = block_tree_path_get_key(&path);
2910     /* modify leftmost entry in tree */
2911     if (prev_key == old_key &&
2912         block_mac_same_block(tr, &path.data, &old_data)) {
2913         prev_key = 0;
2914     }
2915 
2916     block_tree_walk(tr, tree, old_key, false, &path); /* TODO: make writeable */
2917     if (tr->failed) {
2918         pr_warn("transaction failed, abort\n");
2919         goto err;
2920     }
2921     assert(path.count > 0);
2922 
2923     if (!block_mac_same_block(tr, &path.data, &old_data)) {
2924         /* TODO: make path writeable after finding maching entry */
2925         block_tree_walk(tr, tree, old_key - 1, true, &path);
2926         while (!block_mac_same_block(tr, &path.data, &old_data) ||
2927                block_tree_path_get_key(&path) != old_key) {
2928             assert(block_tree_path_get_key(&path));
2929             block_tree_path_next(&path);
2930         }
2931     }
2932 
2933     block_mac = &path.entry[path.count - 1].block_mac;
2934     index = path.entry[path.count - 1].index;
2935 
2936     node_ro = block_get(tr, block_mac, NULL, &node_ref);
2937     if (!node_ro) {
2938         assert(tr->failed);
2939         pr_warn("transaction failed, abort\n");
2940         goto err;
2941     }
2942     max_key_count = block_tree_node_max_key_count(tree, node_ro);
2943     assert(block_tree_node_is_leaf(node_ro));
2944     assert(block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, index) ==
2945            old_key);
2946     assert(block_mac_same_block(
2947             tr, block_tree_node_get_child_data(tree, node_ro, index),
2948             &old_data));
2949 
2950     if (print_changes) {
2951         printf("%s: key %" PRIu64 "->%" PRIu64 ", data %" PRIu64 "->%" PRIu64
2952                ", index %d\n",
2953                __func__, old_key, new_key, block_mac_to_block(tr, &old_data),
2954                block_mac_to_block(tr, &new_data), index);
2955         block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac),
2956                               node_ro);
2957     }
2958 
2959     next_key = (index + 1 < max_key_count)
2960                        ? block_tree_node_get_key(tree, DATA_BLOCK_INVALID,
2961                                                  node_ro, index + 1)
2962                        : 0;
2963     if (path.count > 1 && !next_key) {
2964         next_key = path.entry[path.count - 2].next_key;
2965     }
2966 
2967     node_rw = block_tree_block_dirty(tr, &path, path.count - 1, node_ro);
2968     if (tr->failed) {
2969         block_put(node_ro, &node_ref);
2970         pr_warn("transaction failed, abort\n");
2971         goto err;
2972     }
2973     assert(node_rw);
2974 
2975     if (old_key == new_key) {
2976         assert(tree->child_data_size[1] <= sizeof(new_data));
2977         memcpy(block_tree_node_get_child_data_rw(tree, node_rw, index),
2978                &new_data, tree->child_data_size[1]);
2979     } else if (new_key >= prev_key && (new_key <= next_key || !next_key)) {
2980         block_tree_node_set_key(tree, node_rw, index, new_key);
2981         if (!index) {
2982             path.count--;
2983             assert(new_key);
2984             assert(new_key == block_tree_node_get_key(tree, DATA_BLOCK_INVALID,
2985                                                       node_rw, index));
2986             /* A negative depth to block_tree_update_key indicates a no-op */
2987             block_tree_update_key(tr, &path, (int)path.count - 1, new_key);
2988             path.count++;
2989         }
2990     } else {
2991         assert(0); /* moving entries with block_tree_update is not supported */
2992     }
2993     block_tree_path_put_dirty(tr, &path, path.count - 1, node_rw, &node_ref);
2994 
2995     if (print_changes_full_tree) {
2996         printf("%s: key %" PRIu64 "->%" PRIu64 ", data %" PRIu64 "->%" PRIu64
2997                ", done:\n",
2998                __func__, old_key, new_key, block_mac_to_block(tr, &old_data),
2999                block_mac_to_block(tr, &new_data));
3000         block_tree_print(tr, tree);
3001     }
3002 
3003 err:
3004     tree->update_count++;
3005     tree->updating = false;
3006     full_assert(block_tree_check(tr, tree));
3007 }
3008 
3009 /**
3010  * block_tree_update_block_mac - Update key or data of an existing B+ tree entry
3011  * @tr:             Transaction object.
3012  * @tree:           Tree object.
3013  * @old_key:        Key of entry to update.
3014  * @old_data_block: Data of entry to update.
3015  * @new_key:        New key of entry.
3016  * @new_data_block: New data of entry.
3017  *
3018  * Same as block_tree_update_block_mac but data of type data_block_t.
3019  *
3020  * TODO: Merge with block_tree_update_block_mac
3021  */
block_tree_update(struct transaction * tr,struct block_tree * tree,data_block_t old_key,data_block_t old_data_block,data_block_t new_key,data_block_t new_data_block)3022 void block_tree_update(struct transaction* tr,
3023                        struct block_tree* tree,
3024                        data_block_t old_key,
3025                        data_block_t old_data_block,
3026                        data_block_t new_key,
3027                        data_block_t new_data_block) {
3028     struct block_mac old_data = BLOCK_MAC_INITIAL_VALUE(old_data);
3029     struct block_mac new_data = BLOCK_MAC_INITIAL_VALUE(new_data);
3030 
3031     block_mac_set_block(tr, &old_data, old_data_block);
3032     block_mac_set_block(tr, &new_data, new_data_block);
3033 
3034     block_tree_update_block_mac(tr, tree, old_key, old_data, new_key, new_data);
3035 }
3036 
3037 /**
3038  * block_tree_init - Initialize in-memory state of block backed B+ tree
3039  * @tree:           Tree object.
3040  * @block_size:     Block size.
3041  * @key_size:       Key size. [1-8].
3042  * @child_size:     Child size. [@key_size-24].
3043  * @data_size:      Data size. [1-24].
3044  *
3045  * TODO: set copy-on-write flags.
3046  */
block_tree_init(struct block_tree * tree,size_t block_size,size_t key_size,size_t child_size,size_t data_size)3047 void block_tree_init(struct block_tree* tree,
3048                      size_t block_size,
3049                      size_t key_size,
3050                      size_t child_size,
3051                      size_t data_size) {
3052     memset(tree, 0, sizeof(*tree));
3053     block_tree_set_sizes(tree, block_size, key_size, child_size, data_size);
3054 }
3055 
3056 /**
3057  * block_tree_init - Initialize in-memory state of copy-on-write B+ tree
3058  * @dst:        New tree object.
3059  * @src:        Source tree object.
3060  *
3061  * Initialize @dst to point to the same tree as @src, but set flag to enable
3062  * copy-on-write so all nodes that need changes get copied to new blocks before
3063  * those changes are made.
3064  */
block_tree_copy(struct block_tree * dst,const struct block_tree * src)3065 void block_tree_copy(struct block_tree* dst, const struct block_tree* src) {
3066     assert(src->copy_on_write);
3067     *dst = *src;
3068     dst->allow_copy_on_write = true;
3069 }
3070 
3071 #if BUILD_STORAGE_TEST
log_n(double n,double x)3072 static double log_n(double n, double x) {
3073     return log(x) / log(n);
3074 }
3075 
3076 static bool block_tree_max_depth_needed;
3077 
block_tree_check_config(struct block_device * dev)3078 void block_tree_check_config(struct block_device* dev) {
3079     struct block_tree tree[1];
3080     int node_internal_min_key_count;
3081     int node_internal_max_key_count;
3082     int node_min_child_count;
3083     int node_max_child_count;
3084     int node_leaf_min_key_count;
3085     int node_leaf_max_key_count;
3086     double tree_min_entry_count;
3087     double tree_max_entry_count;
3088     int depth;
3089     int max_entries_needed = (dev->block_count + 1) / 2;
3090     int depth_needed;
3091 
3092     block_tree_set_sizes(tree, dev->block_size, sizeof(data_block_t),
3093                          sizeof(struct block_mac), sizeof(struct block_mac));
3094 
3095     node_internal_min_key_count = block_tree_min_key_count(tree, false);
3096     node_internal_max_key_count = block_tree_max_key_count(tree, false);
3097     node_min_child_count = node_internal_min_key_count + 1;
3098     node_max_child_count = node_internal_max_key_count + 1;
3099     node_leaf_min_key_count = block_tree_min_key_count(tree, true);
3100     node_leaf_max_key_count = block_tree_max_key_count(tree, true);
3101 
3102     printf("block_tree config\n");
3103     printf("internal min key count: %6d\n", node_internal_min_key_count);
3104     printf("internal max key count: %6d\n", node_internal_max_key_count);
3105     printf("min child count:        %6d\n", node_min_child_count);
3106     printf("max child count:        %6d\n", node_max_child_count);
3107     printf("leaf min key count:     %6d\n", node_leaf_min_key_count);
3108     printf("leaf max key count:     %6d\n", node_leaf_max_key_count);
3109     printf("block size:             %6zd\n", dev->block_size);
3110     printf("block count:            %6" PRIu64 "\n", dev->block_count);
3111     printf("max entries needed:     %6d\n", max_entries_needed);
3112     printf("max depth:              %6d\n", BLOCK_TREE_MAX_DEPTH);
3113     for (depth = 1; depth <= BLOCK_TREE_MAX_DEPTH; depth++) {
3114         if (depth == 1) {
3115             tree_min_entry_count = 0;
3116             tree_max_entry_count = node_leaf_max_key_count;
3117         } else if (depth == 2) {
3118             tree_min_entry_count = node_leaf_min_key_count * 2;
3119         } else {
3120             tree_min_entry_count *= node_min_child_count;
3121         }
3122         tree_max_entry_count *= node_max_child_count;
3123         printf("depth %2d: entry count range: %20.15g - %20.15g (alt. %15.10g - %15.10g)\n",
3124                depth, tree_min_entry_count, tree_max_entry_count,
3125                2.0 * pow(ceil(node_max_child_count / 2.0), depth - 1),
3126                pow(node_max_child_count, depth) -
3127                        pow(node_max_child_count, depth - 1));
3128     }
3129     depth_needed = ceil(log_n(ceil(node_max_child_count / 2.0),
3130                               ceil(max_entries_needed / 2.0))) +
3131                    1;
3132     printf("est. tree depth needed for %u entries: %d\n", max_entries_needed,
3133            depth_needed);
3134     printf("est. tree depth needed for %" PRIu64 " entries: %g\n",
3135            ((data_block_t)~0 / dev->block_size),
3136            ceil(log_n(ceil(node_max_child_count / 2.0),
3137                       ceil(((data_block_t)~0 / dev->block_size) / 2.0)) +
3138                 1.0));
3139     printf("est. tree depth needed for %" PRIu64 " entries: %g\n",
3140            ((data_block_t)~0),
3141            ceil(log_n(ceil(node_max_child_count / 2.0),
3142                       (((data_block_t)~0) / 2 + 1)) +
3143                 1.0));
3144     assert(tree_min_entry_count >= max_entries_needed);
3145     if (tree_min_entry_count / node_min_child_count < max_entries_needed) {
3146         block_tree_max_depth_needed = true;
3147     }
3148 }
3149 
block_tree_check_config_done(void)3150 void block_tree_check_config_done(void) {
3151     assert(block_tree_max_depth_needed);
3152 }
3153 
3154 #endif
3155