1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "block_tree.h"
18
19 #include <assert.h>
20 #include <inttypes.h>
21 #include <limits.h>
22 #include <lk/compiler.h>
23 #include <lk/macros.h>
24 #include <stdbool.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "array.h"
31 #include "block_allocator.h"
32 #include "block_cache.h"
33 #include "crypt.h"
34 #include "debug.h"
35 #include "transaction.h"
36
37 #if BUILD_STORAGE_TEST
38 #include <math.h>
39 #endif
40
41 bool print_lookup;
42 static bool print_ops;
43 static bool print_node_changes;
44 static bool print_internal_changes;
45 static bool print_changes;
46 static bool print_changes_full_tree;
47 static bool print_check_errors;
48
49 /**
50 * struct block_tree_node - On-disk B+ tree node header and payload start
51 * @iv: initial value used for encrypt/decrypt
52 * @is_leaf: 0 for internal nodes, 1 for leaf nodes.
53 * @data: Key, child and data payload, size of entries is tree specific
54 * so accessor functions are needed to get to specific entries.
55 *
56 * On-disk B+ tree node format.
57 *
58 * If %is_leaf is 0, the node is an internal B+ tree node and @data contains n
59 * keys and n + 1 child block-macs. If %is_leaf is 1, the node is a leaf node
60 * and @data contains n keys and n data entries. For either node type
61 * n <= block_tree_max_key_count. For root leaf nodes n >= 0, for root internal
62 * nodes n >= 1, otherwise n >= block_tree_min_key_count. While updating the
63 * tree, n can temporarily exceed these limits by 1 for a single node. There is
64 * no room in this struct to store extra keys, so the extra key and child/data
65 * is stored in the in-memory tree struct.
66 *
67 * The value of block_tree_max_key_count and block_tree_min_key_count depends
68 * on the block, key, child and data sizes for the tree, and may not be the
69 * same for leaf and internal nodes.
70 *
71 * Keys are stored starting at @data, child and data values are stored starting
72 * at data + block_tree_max_key_count * key-size.
73 *
74 * TODO: store n in this struct? Currently a 0 key value us used to determine
75 * how full a node is, which prevents storing 0 keys in the tree.
76 */
77 struct block_tree_node {
78 struct iv iv;
79 uint64_t is_leaf;
80 uint8_t data[0];
81 };
82
83 /**
84 * block_tree_node_is_leaf - Check if node is a leaf or an internal node
85 * @node_ro: Node pointer.
86 *
87 * Return: %true if @node_ro is a leaf node, %false if @node_ro is an internal
88 * node.
89 */
block_tree_node_is_leaf(const struct block_tree_node * node_ro)90 static bool block_tree_node_is_leaf(const struct block_tree_node* node_ro) {
91 assert(node_ro);
92 assert(node_ro->is_leaf <= 1);
93
94 return node_ro->is_leaf;
95 }
96
97 /**
98 * block_tree_set_sizes - Configure tree block and entry sizes
99 * @tree: Tree object.
100 * @block_size: Block size.
101 * @key_size: Key size. [1-8].
102 * @child_size: Child size. [@key_size-24].
103 * @data_size: Data size. [1-24].
104 *
105 * Store tree block and entry sizes and calculate key counts.
106 */
block_tree_set_sizes(struct block_tree * tree,size_t block_size,size_t key_size,size_t child_size,size_t data_size)107 static void block_tree_set_sizes(struct block_tree* tree,
108 size_t block_size,
109 size_t key_size,
110 size_t child_size,
111 size_t data_size) {
112 size_t payload_size = block_size - sizeof(struct block_tree_node);
113
114 assert(payload_size < block_size);
115 assert(key_size);
116 assert(key_size <= sizeof(data_block_t));
117 assert(child_size >= key_size);
118 assert(child_size <= sizeof(struct block_mac));
119 assert(data_size);
120 assert(data_size <= sizeof(struct block_mac));
121
122 tree->key_size = key_size;
123 tree->child_data_size[0] = child_size;
124 tree->child_data_size[1] = data_size;
125 tree->key_count[0] = (payload_size - child_size) / (key_size + child_size);
126 /* TODO: allow next pointer when mac is not needed? */
127 tree->key_count[1] = (payload_size) / (key_size + data_size);
128 tree->block_size = block_size;
129
130 assert(tree->key_count[0] >= 2);
131 assert(tree->key_count[1] >= 2);
132 }
133
134 /**
135 * is_zero - Helper function to check that buffer only contain 0 bytes
136 * @data: Data pointer.
137 * @size: Number of bytes to check.
138 *
139 * Return: %true if @size is 0 or @data[0..@size - 1] is all 0, %false
140 * otherwise.
141 */
is_zero(const void * data,size_t size)142 static bool is_zero(const void* data, size_t size) {
143 if (!size) {
144 return true;
145 }
146 assert(size >= 1);
147 return !*(char*)data && !memcmp(data, data + 1, size - 1);
148 }
149
150 /**
151 * block_tree_max_key_count - Return max key count for leaf or internal nodes
152 * @tree: Tree object.
153 * @leaf: %true for leaf node, %false for internal node.
154 *
155 * Return: Maximum number of keys that fit in a leaf node or in and internal
156 * node.
157 */
block_tree_max_key_count(const struct block_tree * tree,bool leaf)158 static unsigned int block_tree_max_key_count(const struct block_tree* tree,
159 bool leaf) {
160 unsigned int key_count = tree->key_count[leaf];
161 assert(key_count);
162 assert(key_count * tree->key_size < tree->block_size);
163
164 return key_count;
165 }
166
167 /**
168 * block_tree_node_max_key_count - Return max key count for specific node
169 * @tree: Tree object.
170 * @node_ro: Node pointer.
171 *
172 * Return: Maximum number of keys that fit @node_ro.
173 */
block_tree_node_max_key_count(const struct block_tree * tree,const struct block_tree_node * node_ro)174 static unsigned int block_tree_node_max_key_count(
175 const struct block_tree* tree,
176 const struct block_tree_node* node_ro) {
177 return block_tree_max_key_count(tree, block_tree_node_is_leaf(node_ro));
178 }
179
180 /**
181 * Child selection for block_tree_node_shift
182 * @SHIFT_LEAF_OR_LEFT_CHILD: Required value for leaf nodes, selects left
183 * child for internal nodes.
184 * @SHIFT_RIGHT_CHILD: Select right child.
185 * @SHIFT_LEFT_CHILD: Select left child (for src_index).
186 * @SHIFT_BOTH: Select left child at start and right child at
187 * end. Only valid when inserting data at the end
188 * of a node.
189 */
190 #define SHIFT_LEAF_OR_LEFT_CHILD 0U
191 #define SHIFT_RIGHT_CHILD (1U << 0)
192 #define SHIFT_LEFT_CHILD (1U << 1)
193 #define SHIFT_BOTH (SHIFT_RIGHT_CHILD | SHIFT_LEFT_CHILD)
194
195 /**
196 * block_tree_node_shift - Helper function to insert or remove entries in a node
197 * @tree: Tree object.
198 * @node_rw: Node pointer.
199 * @dest_index: Destination index for existing entries to shift.
200 * @src_index: Source index for existing entries to shift.
201 * @shift_mode: Specifies which child to shift for internal nodes.
202 * @new_key: When shifting up (inserting), keys to insert.
203 * @new_data: When shifting up (inserting), data (or child) to insert.
204 * @overflow_key: When shifting up (inserting), location to store keys that
205 * was shifted out of range. Can be %NULL if all those keys are
206 * 0.
207 * @overflow_data: When shifting up (inserting), location to store data (or
208 * child) that was shifted out of range. Can be %NULL if that
209 * data is all 0.
210 */
block_tree_node_shift(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int dest_index,unsigned int src_index,uint32_t shift_mode,const void * new_key,const void * new_data,void * overflow_key,void * overflow_data)211 static void block_tree_node_shift(const struct block_tree* tree,
212 struct block_tree_node* node_rw,
213 unsigned int dest_index,
214 unsigned int src_index,
215 uint32_t shift_mode,
216 const void* new_key,
217 const void* new_data,
218 void* overflow_key,
219 void* overflow_data) {
220 bool is_leaf = block_tree_node_is_leaf(node_rw);
221 unsigned int max_count = tree->key_count[is_leaf];
222
223 int i;
224 void* base;
225 size_t entry_size;
226
227 const void* src;
228 void* dest;
229 size_t size;
230 unsigned int clear_index;
231
232 assert(max_count);
233 assert(dest_index <= max_count + !is_leaf + 1);
234
235 for (i = 0; i < 2; i++) {
236 if (i == 0) {
237 /* key */
238 base = node_rw->data;
239 entry_size = tree->key_size;
240 } else {
241 /* data/child */
242 base = node_rw->data + tree->key_size * max_count;
243 entry_size = tree->child_data_size[is_leaf];
244 if (!is_leaf) {
245 /* internal nodes have one more child than keys */
246 max_count++;
247 }
248 if (shift_mode & SHIFT_RIGHT_CHILD) {
249 assert(!is_leaf);
250 if (!(shift_mode & SHIFT_LEFT_CHILD) && src_index != UINT_MAX) {
251 src_index++;
252 }
253 dest_index++;
254 }
255 }
256
257 if (src_index < dest_index) {
258 /* Inserting, copy entries that will be overwritten to overflow_* */
259 size = (dest_index - src_index) * entry_size;
260 if (src_index == max_count) {
261 src = i == 0 ? new_key : new_data;
262 } else {
263 src = base + max_count * entry_size - size;
264 }
265 dest = i == 0 ? overflow_key : overflow_data;
266 if (dest) {
267 if (print_node_changes) {
268 #if TLOG_LVL >= TLOG_LVL_DEBUG
269 printf("%s: copy %p, index %d/%d, to overflow, %p, size %zd, is_zero %d\n",
270 __func__, src, max_count - (dest_index - src_index),
271 max_count, dest, size, is_zero(src, size));
272 #else
273 printf("%s: copy src index %d/%d, to overflow dest, size %zd, is_zero %d\n",
274 __func__, max_count - (dest_index - src_index),
275 max_count, size, is_zero(src, size));
276 #endif
277 }
278 memcpy(dest, src, size);
279 } else {
280 assert(is_zero(src, size));
281 }
282 }
283
284 if (src_index < max_count) {
285 /* Inserting or deleting, shift up or down */
286 src = base + src_index * entry_size;
287 dest = base + dest_index * entry_size;
288 size = (max_count - MAX(src_index, dest_index)) * entry_size;
289 if (print_node_changes) {
290 #if TLOG_LVL >= TLOG_LVL_DEBUG
291 printf("%s: move %p, index %d, to %p, index %d, size %zd, is_zero %d\n",
292 __func__, src, src_index, dest, dest_index, size,
293 is_zero(src, size));
294 #else
295 printf("%s: move src index %d, to dest index %d, size %zd, is_zero %d\n",
296 __func__, src_index, dest_index, size,
297 is_zero(src, size));
298 #endif
299 }
300 memmove(dest, src, size);
301 if (src_index >= dest_index) {
302 clear_index = max_count + dest_index - src_index;
303 }
304 } else {
305 clear_index = dest_index;
306 }
307
308 if (src_index < dest_index) {
309 /* Inserting, copy new entries passed in */
310 assert(new_key);
311 assert(new_data);
312 src = i == 0 ? new_key : new_data;
313 dest = base + src_index * entry_size;
314 size = (dest_index - src_index) * entry_size;
315 if (src_index == max_count) {
316 /* NOP, data was copied to overflow_* above */
317 } else {
318 assert(src);
319 if (print_node_changes) {
320 #if TLOG_LVL >= TLOG_LVL_DEBUG
321 printf("%s: copy new data %p, to %p, index %d, size %zd, is_zero %d\n",
322 __func__, src, dest, src_index, size,
323 is_zero(src, size));
324 #else
325 printf("%s: copy new data, index %d, size %zd, is_zero %d\n",
326 __func__, src_index, size,
327 is_zero(src, size));
328 #endif
329 }
330 memcpy(dest, src, size);
331 }
332 } else {
333 /* Deleting, clear unused space at end */
334 assert(dest_index <= max_count);
335 dest = base + clear_index * entry_size;
336 size = (max_count - clear_index) * entry_size;
337 if (print_node_changes) {
338 #if TLOG_LVL >= TLOG_LVL_DEBUG
339 printf("%s: clear %p, index %d/%d, size %zd\n", __func__, dest,
340 clear_index, max_count, size);
341 #else
342 printf("%s: clear dest, index %d/%d, size %zd\n", __func__,
343 clear_index, max_count, size);
344 #endif
345 }
346 memset(dest, 0, size);
347 }
348 }
349 }
350
351 /**
352 * block_tree_node_merge_entries - Helper function to merge nodes
353 * @tree: Tree object.
354 * @node_rw: Destination node.
355 * @src_node_ro: Source node.
356 * @dest_index: Index to insert at in @node_rw.
357 * @count: Number of entries to copy from start of @src_node_ro.
358 * @merge_key: For internal nodes, key to insert between nodes.
359 */
block_tree_node_merge_entries(const struct block_tree * tree,struct block_tree_node * node_rw,const struct block_tree_node * src_node_ro,unsigned int dest_index,unsigned int count,const void * merge_key)360 static void block_tree_node_merge_entries(
361 const struct block_tree* tree,
362 struct block_tree_node* node_rw,
363 const struct block_tree_node* src_node_ro,
364 unsigned int dest_index,
365 unsigned int count,
366 const void* merge_key) {
367 bool is_leaf = block_tree_node_is_leaf(node_rw);
368 unsigned int max_count = tree->key_count[is_leaf];
369 void* dest_key;
370 uint32_t shift_mode = SHIFT_LEAF_OR_LEFT_CHILD;
371 if (!is_leaf) {
372 dest_key = node_rw->data + tree->key_size * dest_index;
373 assert(is_zero(dest_key, tree->key_size));
374 memcpy(dest_key, merge_key, tree->key_size);
375 dest_index++;
376 shift_mode = SHIFT_BOTH;
377 }
378 block_tree_node_shift(tree, node_rw, dest_index + count, dest_index,
379 shift_mode, src_node_ro->data,
380 src_node_ro->data + tree->key_size * max_count, NULL,
381 NULL);
382 }
383
384 /**
385 * block_tree_node_shift_down - Helper function to delete entries from node
386 * @tree: Tree object.
387 * @node_rw: Node.
388 * @dest_index: Destination index for existing entries to shift (or
389 * first entry to remove).
390 * @src_index: Source index for existing entries to shift (or first entry
391 * after @dest_index not to remove).
392 * @shift_mode: Specifies which child to shift for internal nodes.
393 */
block_tree_node_shift_down(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int dest_index,unsigned int src_index,uint32_t shift_mode)394 static void block_tree_node_shift_down(const struct block_tree* tree,
395 struct block_tree_node* node_rw,
396 unsigned int dest_index,
397 unsigned int src_index,
398 uint32_t shift_mode) {
399 assert(dest_index < src_index);
400 block_tree_node_shift(tree, node_rw, dest_index, src_index, shift_mode,
401 NULL, NULL, NULL, NULL);
402 }
403
404 /**
405 * block_tree_node_shift_down - Helper function to delete entries from end of
406 * node
407 * @tree: Tree object.
408 * @node_rw: Node.
409 * @start_index: Index of first entry to remove. For internal nodes the
410 * the right child is removed, so @start_index refers to the
411 * key index, not the child index.
412 */
block_tree_node_clear_end(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int start_index)413 static void block_tree_node_clear_end(const struct block_tree* tree,
414 struct block_tree_node* node_rw,
415 unsigned int start_index) {
416 block_tree_node_shift_down(tree, node_rw, start_index, UINT_MAX,
417 block_tree_node_is_leaf(node_rw)
418 ? SHIFT_LEAF_OR_LEFT_CHILD
419 : SHIFT_RIGHT_CHILD);
420 }
421
422 /**
423 * block_tree_node_insert - Helper function to insert one entry in a node
424 * @tree: Tree object.
425 * @node_rw: Node.
426 * @index: Index to insert at.
427 * @shift_mode: Specifies which child to insert for internal nodes.
428 * @new_key: Key to insert.
429 * @new_data: Data or child to insert.
430 * @overflow_key: Location to store key that was shifted out of range. Can be
431 * %NULL if that key is always 0.
432 * @overflow_data: Location to store data (or child) that was shifted out of
433 * range. Can be %NULL if that data is all 0.
434 */
block_tree_node_insert(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int index,uint32_t shift_mode,const void * new_key,const void * new_data,void * overflow_key,void * overflow_data)435 static void block_tree_node_insert(const struct block_tree* tree,
436 struct block_tree_node* node_rw,
437 unsigned int index,
438 uint32_t shift_mode,
439 const void* new_key,
440 const void* new_data,
441 void* overflow_key,
442 void* overflow_data) {
443 block_tree_node_shift(tree, node_rw, index + 1, index, shift_mode, new_key,
444 new_data, overflow_key, overflow_data);
445 }
446
447 /**
448 * block_tree_node_get_key - Get key from node or in-memory pending update
449 * @tree: Tree object.
450 * @node_block: Block number where @node_ro data is stored.
451 * @node_ro: Node.
452 * @index: Index of key to get.
453 *
454 * Return: key at @index, or 0 if there is no key at @index. If @index is
455 * equal to max key count, check for a matching entry in @tree->inserting.
456 */
block_tree_node_get_key(const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,unsigned int index)457 static data_block_t block_tree_node_get_key(
458 const struct block_tree* tree,
459 data_block_t node_block,
460 const struct block_tree_node* node_ro,
461 unsigned int index) {
462 data_block_t key = 0;
463 const void* keyp;
464 const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
465 const size_t key_size = tree->key_size;
466
467 assert(node_ro);
468
469 if (index < key_count) {
470 keyp = node_ro->data + index * tree->key_size;
471 assert(sizeof(key) >= key_size);
472 memcpy(&key, keyp, key_size);
473 }
474 if (!key && node_block == tree->inserting.block) {
475 assert(index >= key_count);
476
477 if (index <= key_count) {
478 key = tree->inserting.key;
479 }
480 }
481 return key;
482 }
483
484 /**
485 * block_tree_node_set_key - Set key in node
486 * @tree: Tree object.
487 * @node_rw: Node.
488 * @index: Index of key to set.
489 * @new_key: Key value to write.
490 */
block_tree_node_set_key(const struct block_tree * tree,struct block_tree_node * node_rw,unsigned int index,data_block_t new_key)491 static void block_tree_node_set_key(const struct block_tree* tree,
492 struct block_tree_node* node_rw,
493 unsigned int index,
494 data_block_t new_key) {
495 const size_t key_size = tree->key_size;
496 const size_t key_count = block_tree_node_max_key_count(tree, node_rw);
497
498 assert(node_rw);
499 assert(index < key_count);
500 assert(key_size <= sizeof(new_key));
501 /* TODO: support big-endian host */
502 memcpy(node_rw->data + index * tree->key_size, &new_key, key_size);
503 }
504
505 /**
506 * block_tree_node_get_child_data - Get pointer to child or data
507 * @tree: Tree object.
508 * @node_ro: Node.
509 * @index: Index of child or data entry to get.
510 *
511 * Return: pointer to child or data entry at index @index in @node_ro.
512 */
block_tree_node_get_child_data(const struct block_tree * tree,const struct block_tree_node * node_ro,unsigned int index)513 static const void* block_tree_node_get_child_data(
514 const struct block_tree* tree,
515 const struct block_tree_node* node_ro,
516 unsigned int index) {
517 bool is_leaf = block_tree_node_is_leaf(node_ro);
518 const size_t max_key_count = tree->key_count[is_leaf];
519 const size_t child_data_size = tree->child_data_size[is_leaf];
520 const void* child_data;
521
522 assert(index < max_key_count + !is_leaf);
523
524 child_data = node_ro->data + tree->key_size * max_key_count +
525 child_data_size * index;
526
527 assert(child_data > (void*)node_ro->data);
528 assert(child_data < (void*)node_ro + tree->block_size);
529
530 return child_data;
531 }
532
533 /**
534 * block_tree_node_get_child_data_rw - Get pointer to child or data
535 * @tree: Tree object.
536 * @node_rw: Node.
537 * @index: Index of child or data entry to get.
538 *
539 * Return: pointer to child or data entry at index @index in @node_rw.
540 */
block_tree_node_get_child_data_rw(const struct block_tree * tree,struct block_tree_node * node_rw,int index)541 static void* block_tree_node_get_child_data_rw(const struct block_tree* tree,
542 struct block_tree_node* node_rw,
543 int index) {
544 return (void*)block_tree_node_get_child_data(tree, node_rw, index);
545 }
546
547 /**
548 * block_tree_node_get_child - Get child from node or in-memory pending update
549 * @tree: Tree object.
550 * @node_block: Block number where @node_ro data is stored.
551 * @node_ro: Internal node.
552 * @index: Index of child to get.
553 *
554 * Return: child at @index, or 0 if there is no child at @index. If @index is
555 * equal to max child count, check for a matching entry in @tree->inserting.
556 */
block_tree_node_get_child(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,unsigned int index)557 static const struct block_mac* block_tree_node_get_child(
558 const struct transaction* tr,
559 const struct block_tree* tree,
560 data_block_t node_block,
561 const struct block_tree_node* node_ro,
562 unsigned int index) {
563 const struct block_mac* child = NULL;
564 const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
565
566 assert(!block_tree_node_is_leaf(node_ro));
567
568 if (index <= key_count) {
569 child = block_tree_node_get_child_data(tree, node_ro, index);
570 if (!block_mac_to_block(tr, child)) {
571 child = NULL;
572 }
573 }
574
575 if (!child && node_block == tree->inserting.block) {
576 assert(index > key_count);
577
578 if (index <= key_count + 1) {
579 child = &tree->inserting.child;
580 }
581 }
582
583 return child;
584 }
585
586 /**
587 * block_tree_node_get_data - Get data from node or in-memory pending update
588 * @tree: Tree object.
589 * @node_block: Block number where @node_ro data is stored.
590 * @node_ro: Leaf node.
591 * @index: Index of data to get.
592 *
593 * Return: data at @index, or 0 if there is no data at @index. If @index is
594 * equal to max key count, check for a matching entry in @tree->inserting.
595 */
block_tree_node_get_data(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,unsigned int index)596 static struct block_mac block_tree_node_get_data(
597 const struct transaction* tr,
598 const struct block_tree* tree,
599 data_block_t node_block,
600 const struct block_tree_node* node_ro,
601 unsigned int index) {
602 struct block_mac block_mac_ret = BLOCK_MAC_INITIAL_VALUE(block_mac_ret);
603 const struct block_mac* datap = NULL;
604 const size_t max_key_count = block_tree_node_max_key_count(tree, node_ro);
605
606 assert(block_tree_node_is_leaf(node_ro));
607
608 if (index < max_key_count) {
609 datap = block_tree_node_get_child_data(tree, node_ro, index);
610 if (!block_mac_to_block(tr, datap)) {
611 datap = NULL;
612 }
613 }
614 if (!datap && node_block == tree->inserting.block) {
615 assert(index >= max_key_count);
616
617 if (index <= max_key_count) {
618 datap = &tree->inserting.data;
619 }
620 }
621 if (datap) {
622 block_mac_copy(tr, &block_mac_ret, datap);
623 }
624 return block_mac_ret;
625 }
626
627 #define block_tree_node_for_each_child(tr, tree, block, node_ro, child, i) \
628 for (i = 0; \
629 (child = block_tree_node_get_child(tr, tree, block, node_ro, i)); \
630 i++)
631
632 /**
633 * block_tree_node_print_internal - Print internal node
634 * @tr: Transaction object.
635 * @tree: Tree object.
636 * @node_block: Block number where @node_ro data is stored.
637 * @node_ro: Node.
638 */
block_tree_node_print_internal(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro)639 static void block_tree_node_print_internal(
640 const struct transaction* tr,
641 const struct block_tree* tree,
642 data_block_t node_block,
643 const struct block_tree_node* node_ro) {
644 unsigned int i;
645 const struct block_mac* child;
646 const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
647
648 assert(!block_tree_node_is_leaf(node_ro));
649
650 for (i = 0; i <= key_count + 1; i++) {
651 child = block_tree_node_get_child(tr, tree, node_block, node_ro, i);
652 if (child) {
653 printf(" %" PRIu64 "", block_mac_to_block(tr, child));
654 } else if (i < key_count + 1) {
655 printf(" .");
656 }
657 if (block_tree_node_get_key(tree, node_block, node_ro, i)) {
658 if (i == key_count) {
659 printf(" inserting");
660 }
661 printf(" [%" PRIu64 "-]",
662 block_tree_node_get_key(tree, node_block, node_ro, i));
663 }
664 }
665 assert(!block_tree_node_get_child(tr, tree, node_block, node_ro, i));
666 printf("\n");
667 }
668
669 /**
670 * block_tree_node_print_leaf - Print leaf node
671 * @tr: Transaction object.
672 * @tree: Tree object.
673 * @node_block: Block number where @node_ro data is stored.
674 * @node_ro: Node.
675 */
block_tree_node_print_leaf(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro)676 static void block_tree_node_print_leaf(const struct transaction* tr,
677 const struct block_tree* tree,
678 data_block_t node_block,
679 const struct block_tree_node* node_ro) {
680 unsigned int i;
681 data_block_t key;
682 struct block_mac data;
683 const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
684
685 assert(block_tree_node_is_leaf(node_ro));
686
687 for (i = 0; i <= key_count; i++) {
688 key = block_tree_node_get_key(tree, node_block, node_ro, i);
689 data = block_tree_node_get_data(tr, tree, node_block, node_ro, i);
690 if (key || block_mac_valid(tr, &data)) {
691 if (i == key_count) {
692 printf(" inserting");
693 }
694 printf(" [%" PRIu64 ": %" PRIu64 "]", key,
695 block_mac_to_block(tr, &data));
696 } else if (i < key_count) {
697 printf(" .");
698 }
699 }
700 printf("\n");
701 }
702
703 /**
704 * block_tree_node_print - Print node
705 * @tr: Transaction object.
706 * @tree: Tree object.
707 * @node_block: Block number where @node_ro data is stored.
708 * @node_ro: Node.
709 */
block_tree_node_print(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro)710 static void block_tree_node_print(const struct transaction* tr,
711 const struct block_tree* tree,
712 data_block_t node_block,
713 const struct block_tree_node* node_ro) {
714 printf(" %3" PRIu64 ":", node_block);
715 if (node_ro->is_leaf == true) {
716 block_tree_node_print_leaf(tr, tree, node_block, node_ro);
717 } else if (!node_ro->is_leaf) {
718 block_tree_node_print_internal(tr, tree, node_block, node_ro);
719 } else {
720 printf(" bad node header %" PRIx64 "\n", node_ro->is_leaf);
721 }
722 }
723
724 /**
725 * block_tree_print_sub_tree - Print tree or a branch of a tree
726 * @tr: Transaction object.
727 * @tree: Tree object.
728 * @block_mac: Root of tree or branch to print.
729 */
block_tree_print_sub_tree(struct transaction * tr,const struct block_tree * tree,const struct block_mac * block_mac)730 static void block_tree_print_sub_tree(struct transaction* tr,
731 const struct block_tree* tree,
732 const struct block_mac* block_mac) {
733 int i;
734 const struct block_tree_node* node_ro;
735 struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
736 const struct block_mac* child;
737
738 if (!block_mac || !block_mac_valid(tr, block_mac)) {
739 printf("empty\n");
740 return;
741 }
742
743 node_ro = block_get(tr, block_mac, NULL, &node_ref);
744 if (!node_ro) {
745 printf(" %3" PRIu64 ": unreadable\n",
746 block_mac_to_block(tr, block_mac));
747 return;
748 }
749 block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac), node_ro);
750 if (!node_ro->is_leaf) {
751 block_tree_node_for_each_child(tr, tree,
752 block_mac_to_block(tr, block_mac),
753 node_ro, child, i) {
754 block_tree_print_sub_tree(tr, tree, child);
755 }
756 }
757 block_put(node_ro, &node_ref);
758 }
759
760 /**
761 * block_tree_print - Print tree
762 * @tr: Transaction object.
763 * @tree: Tree object.
764 */
block_tree_print(struct transaction * tr,const struct block_tree * tree)765 void block_tree_print(struct transaction* tr, const struct block_tree* tree) {
766 block_tree_print_sub_tree(tr, tree, &tree->root);
767 }
768
769 /**
770 * block_tree_node_check - Check node for errors
771 * @tr: Transaction object.
772 * @tree: Tree object.
773 * @node_ro: Node.
774 * @node_block: Block number where @node_ro data is stored.
775 * @min_key: Minimum allowed key value.
776 * @max_key: Maximum allowed key value.
777 *
778 * Return: %false is an error is detected, %true otherwise.
779 */
block_tree_node_check(const struct transaction * tr,const struct block_tree * tree,const struct block_tree_node * node_ro,data_block_t node_block,data_block_t min_key,data_block_t max_key)780 static bool block_tree_node_check(const struct transaction* tr,
781 const struct block_tree* tree,
782 const struct block_tree_node* node_ro,
783 data_block_t node_block,
784 data_block_t min_key,
785 data_block_t max_key) {
786 unsigned int i;
787 data_block_t key;
788 data_block_t prev_key = 0;
789 int empty_count;
790 const void* child_data;
791 size_t key_count = block_tree_node_max_key_count(tree, node_ro);
792 bool is_leaf;
793
794 if (node_ro->is_leaf && node_ro->is_leaf != true) {
795 printf("%s: bad node header %" PRIx64 "\n", __func__, node_ro->is_leaf);
796 goto err;
797 }
798 is_leaf = block_tree_node_is_leaf(node_ro);
799
800 empty_count = 0;
801 for (i = 0; i < key_count; i++) {
802 key = block_tree_node_get_key(tree, node_block, node_ro, i);
803 if (!key) {
804 empty_count++;
805 }
806 if (empty_count) {
807 if (key) {
808 printf("%s: %" PRIu64
809 ": expected zero key at %d, found %" PRIu64 "\n",
810 __func__, node_block, i, key);
811 goto err;
812 }
813 child_data =
814 block_tree_node_get_child_data(tree, node_ro, i + !is_leaf);
815 if (!is_zero(child_data, tree->child_data_size[is_leaf])) {
816 printf("%s: %" PRIu64
817 ": expected zero data/right child value at %d\n",
818 __func__, node_block, i);
819 goto err;
820 }
821 continue;
822 }
823 if (key < prev_key || key < min_key || key > max_key) {
824 printf("%s: %" PRIu64 ": bad key at %d, %" PRIu64
825 " not in [%" PRIu64 "/%" PRIu64 "-%" PRIu64 "]\n",
826 __func__, node_block, i, key, min_key, prev_key, max_key);
827 if (tr->failed && key >= prev_key) {
828 printf("%s: transaction failed, ignore\n", __func__);
829 } else {
830 goto err;
831 }
832 }
833 prev_key = key;
834 }
835 return true;
836
837 err:
838 if (print_check_errors) {
839 block_tree_node_print(tr, tree, node_block, node_ro);
840 }
841 return false;
842 }
843
844 /**
845 * block_tree_check_sub_tree - Check tree or a branch of a tree for errros
846 * @tr: Transaction object.
847 * @tree: Tree object.
848 * @block_mac: Root of tree or branch to check.
849 * @is_root: %true if @block_mac refers to the root of the tree, %false
850 * otherwise.
851 * @min_key: Minimum allowed key value.
852 * @max_key: Maximum allowed key value.
853 * @updating: %true if @tree is currently updating and nodes below
854 * min-full should be allowed, %false otherwise.
855 *
856 * Return: Depth of tree/branch, -1 if an error was detected or -2 if @block_mac
857 * could not be read.
858 *
859 * TODO: Reduce overlap with and move more checks to block_tree_node_check.
860 */
block_tree_check_sub_tree(struct transaction * tr,const struct block_tree * tree,const struct block_mac * block_mac,bool is_root,data_block_t min_key,data_block_t max_key,bool updating)861 static int block_tree_check_sub_tree(struct transaction* tr,
862 const struct block_tree* tree,
863 const struct block_mac* block_mac,
864 bool is_root,
865 data_block_t min_key,
866 data_block_t max_key,
867 bool updating) {
868 const struct block_tree_node* node_ro;
869 unsigned int i;
870 int last_child = 0;
871 int empty_count;
872 int depth;
873 int sub_tree_depth;
874 data_block_t child_min_key;
875 data_block_t child_max_key;
876 data_block_t key;
877 int max_empty_count;
878 size_t key_count;
879 const void* child_data;
880 struct block_mac child_block_mac;
881 struct obj_ref ref = OBJ_REF_INITIAL_VALUE(ref);
882 bool is_leaf;
883
884 if (!block_mac || !block_mac_to_block(tr, block_mac))
885 return 0;
886
887 depth = 1;
888
889 if (block_mac_to_block(tr, block_mac) >= tr->fs->dev->block_count) {
890 printf("%s: %3" PRIu64 ": invalid\n", __func__,
891 block_mac_to_block(tr, block_mac));
892 return -1;
893 }
894
895 node_ro = block_get_no_tr_fail(tr, block_mac, NULL, &ref);
896 if (!node_ro) {
897 if (tr->failed) {
898 /*
899 * Failed transactions discards dirty cache entries so parts of the
900 * tree may now be unreadable.
901 */
902 pr_warn("ignore unreadable block %" PRIu64 ", transaction failed\n",
903 block_mac_to_block(tr, block_mac));
904 return -2;
905 }
906 pr_warn("%3" PRIu64 ": unreadable\n",
907 block_mac_to_block(tr, block_mac));
908 return -2;
909 }
910
911 if (!block_tree_node_check(tr, tree, node_ro,
912 block_mac_to_block(tr, block_mac), min_key,
913 max_key)) {
914 goto err;
915 }
916
917 if (node_ro->is_leaf && node_ro->is_leaf != true) {
918 printf("%s: bad node header %" PRIx64 "\n", __func__, node_ro->is_leaf);
919 goto err;
920 }
921 is_leaf = block_tree_node_is_leaf(node_ro);
922
923 key_count = block_tree_node_max_key_count(tree, node_ro);
924 /* TODO: don't allow empty root */
925 max_empty_count =
926 is_root ? (key_count /*- 1*/) : ((key_count + 1) / 2 + updating);
927
928 child_min_key = min_key;
929 empty_count = 0;
930 for (i = 0; i < key_count; i++) {
931 key = block_tree_node_get_key(tree, block_mac_to_block(tr, block_mac),
932 node_ro, i);
933 if (!key) {
934 empty_count++;
935 }
936 if (empty_count) {
937 if (key) {
938 printf("%s: %" PRIu64
939 ": expected zero key at %d, found %" PRIu64 "\n",
940 __func__, block_mac_to_block(tr, block_mac), i, key);
941 goto err;
942 }
943 child_data =
944 block_tree_node_get_child_data(tree, node_ro, i + !is_leaf);
945 if (!is_zero(child_data, tree->child_data_size[is_leaf])) {
946 printf("%s: %" PRIu64
947 ": expected zero data/right child value at %d\n",
948 __func__, block_mac_to_block(tr, block_mac), i);
949 goto err;
950 }
951 continue;
952 }
953 if (i == 0 && min_key && is_leaf && key != min_key) {
954 printf("%s: %" PRIu64 ": bad key at %d, %" PRIu64
955 " not start of [%" PRIu64 "-%" PRIu64 "]\n",
956 __func__, block_mac_to_block(tr, block_mac), i, key, min_key,
957 max_key);
958 if (tr->failed) {
959 printf("%s: transaction failed, ignore\n", __func__);
960 } else if (!key) {
961 // warn only for now
962 printf("%s: ignore empty node error\n", __func__);
963 } else {
964 goto err;
965 }
966 }
967 min_key = key;
968 child_max_key = key;
969 if (!is_leaf) {
970 child_data = block_tree_node_get_child_data(tree, node_ro, i);
971 block_mac_copy(tr, &child_block_mac, child_data);
972 block_put(node_ro, &ref);
973 sub_tree_depth = block_tree_check_sub_tree(
974 tr, tree, &child_block_mac, false, child_min_key,
975 child_max_key, updating);
976 node_ro = block_get_no_tr_fail(tr, block_mac, NULL, &ref);
977 if (!node_ro) {
978 pr_warn("%3" PRIu64 ": unreadable\n",
979 block_mac_to_block(tr, block_mac));
980 return -2;
981 }
982 if (sub_tree_depth == -1) {
983 goto err;
984 }
985 if (sub_tree_depth == -2) {
986 pr_warn("%" PRIu64 ": unreadable subtree at %d\n",
987 block_mac_to_block(tr, block_mac), i);
988 if (depth == 1) {
989 depth = -2;
990 }
991 } else if (depth == 1 || depth == -2) {
992 depth = sub_tree_depth + 1;
993 } else if (sub_tree_depth != depth - 1) {
994 printf("%s: %" PRIu64 ": bad subtree depth at %d, %d != %d\n",
995 __func__, block_mac_to_block(tr, block_mac), i,
996 sub_tree_depth, depth - 1);
997 goto err;
998 }
999 }
1000 child_min_key = key;
1001 min_key = key;
1002 last_child = i + 1;
1003 }
1004 child_max_key = max_key;
1005 if (!is_leaf) {
1006 child_data = block_tree_node_get_child_data(tree, node_ro, last_child);
1007 block_mac_copy(tr, &child_block_mac, child_data);
1008 block_put(node_ro, &ref);
1009 sub_tree_depth = block_tree_check_sub_tree(tr, tree, &child_block_mac,
1010 false, child_min_key,
1011 child_max_key, updating);
1012 node_ro = block_get_no_tr_fail(tr, block_mac, NULL, &ref);
1013 if (!node_ro) {
1014 pr_warn("%3" PRIu64 ": unreadable\n",
1015 block_mac_to_block(tr, block_mac));
1016 return -2;
1017 }
1018 if (sub_tree_depth == -1) {
1019 goto err;
1020 }
1021 if (sub_tree_depth == -2) {
1022 pr_warn("%" PRIu64 ": unreadable subtree at %d\n",
1023 block_mac_to_block(tr, block_mac), last_child);
1024 if (depth == 1) {
1025 depth = -2;
1026 }
1027 } else if (depth == 1 || depth == -2) {
1028 depth = sub_tree_depth + 1;
1029 } else if (sub_tree_depth != depth - 1) {
1030 printf("%s: %" PRIu64 ": bad subtree depth at %d, %d != %d\n",
1031 __func__, block_mac_to_block(tr, block_mac), last_child,
1032 sub_tree_depth, depth - 1);
1033 goto err;
1034 }
1035 }
1036 if (empty_count > max_empty_count) {
1037 printf("%s: %" PRIu64 ": too many empty nodes, %d > %d\n", __func__,
1038 block_mac_to_block(tr, block_mac), empty_count, max_empty_count);
1039 if (tr->failed) {
1040 printf("%s: ignore error, transaction failed\n", __func__);
1041 } else {
1042 goto err;
1043 }
1044 }
1045 block_put(node_ro, &ref);
1046
1047 return depth;
1048
1049 err:
1050 block_put(node_ro, &ref);
1051 return -1;
1052 }
1053
1054 /**
1055 * block_tree_check - Check tree for errros
1056 * @tr: Transaction object.
1057 * @tree: Tree object.
1058 *
1059 * Return: %false if an error was detected, %true otherwise.
1060 */
block_tree_check(struct transaction * tr,const struct block_tree * tree)1061 bool block_tree_check(struct transaction* tr, const struct block_tree* tree) {
1062 int ret;
1063
1064 assert(tr->fs->dev->block_size >= tree->block_size);
1065
1066 ret = block_tree_check_sub_tree(tr, tree, &tree->root, true, 0,
1067 DATA_BLOCK_MAX, tree->updating);
1068 if (ret < 0) {
1069 if (ret == -2) {
1070 pr_warn("block_tree not fully readable\n");
1071 if (!tr->failed) {
1072 transaction_fail(tr);
1073 }
1074 return true;
1075 }
1076 pr_err("%s: invalid block_tree:\n", __func__);
1077 if (print_check_errors) {
1078 block_tree_print(tr, tree);
1079 }
1080 return false;
1081 }
1082 return true;
1083 }
1084
1085 /**
1086 * block_tree_node_full - Check if node is full
1087 * @tree: Tree object.
1088 * @node_ro: Node.
1089 *
1090 * Return: %true is @node_ro is full, %false otherwise.
1091 */
block_tree_node_full(const struct block_tree * tree,const struct block_tree_node * node_ro)1092 static bool block_tree_node_full(const struct block_tree* tree,
1093 const struct block_tree_node* node_ro) {
1094 const size_t key_count = block_tree_node_max_key_count(tree, node_ro);
1095 return !!block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro,
1096 key_count - 1);
1097 }
1098
1099 /**
1100 * block_tree_min_key_count - Return the minimum number of keys for a node
1101 * @tree: Tree object.
1102 * @leaf: %true for leaf nodes, %false for internal nodes.
1103 *
1104 * B+ tree minimum full rules for non-root nodes.
1105 *
1106 * min-children = (max-children + 1) / 2 - 1
1107 * max-keys = max-children - 1
1108 * max-children = max-keys + 1
1109 * min-keys = min-children - 1
1110 * = (max-children + 1) / 2 - 1
1111 * = (max-keys + 1 + 1) / 2 - 1
1112 * = max-keys / 2
1113 *
1114 * Return: minimum number of keys required for non-root leaf or internal nodes.
1115 */
block_tree_min_key_count(const struct block_tree * tree,bool leaf)1116 static unsigned int block_tree_min_key_count(const struct block_tree* tree,
1117 bool leaf) {
1118 return block_tree_max_key_count(tree, leaf) / 2;
1119 }
1120
1121 /**
1122 * block_tree_node_min_full_index - Return index of last key in "min-full" node
1123 * @tree: Tree object.
1124 * @node_ro: Node.
1125 *
1126 * Return: index.
1127 */
block_tree_node_min_full_index(const struct block_tree * tree,const struct block_tree_node * node_ro)1128 static int block_tree_node_min_full_index(
1129 const struct block_tree* tree,
1130 const struct block_tree_node* node_ro) {
1131 return block_tree_min_key_count(tree, block_tree_node_is_leaf(node_ro)) - 1;
1132 }
1133
1134 /**
1135 * block_tree_above_min_full - Check if node has more than the minimum number of
1136 * entries
1137 * @tree: Tree object.
1138 * @node_ro: Node.
1139 *
1140 * Return: %true if @node_ro has more than the minimim required entry count,
1141 * %false otherwise.
1142 */
block_tree_above_min_full(const struct block_tree * tree,const struct block_tree_node * node_ro)1143 bool block_tree_above_min_full(const struct block_tree* tree,
1144 const struct block_tree_node* node_ro) {
1145 int min_full_index = block_tree_node_min_full_index(tree, node_ro);
1146 return !!block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro,
1147 min_full_index + 1);
1148 }
1149
1150 /**
1151 * block_tree_above_min_full - Check if node has less than the minimum number of
1152 * entries
1153 * @tree: Tree object.
1154 * @node_ro: Node.
1155 *
1156 * Return: %true if @node_ro has less than the minimim required entry count,
1157 * %false otherwise.
1158 */
block_tree_below_min_full(const struct block_tree * tree,const struct block_tree_node * node_ro)1159 bool block_tree_below_min_full(const struct block_tree* tree,
1160 const struct block_tree_node* node_ro) {
1161 int min_full_index = block_tree_node_min_full_index(tree, node_ro);
1162 return !block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro,
1163 min_full_index);
1164 }
1165
1166 /**
1167 * block_tree_node_need_copy - Check if node needs to be copied before write
1168 * @tr: Transaction object.
1169 * @tree: Tree object.
1170 * @block_mac: Block number and mac of node to check.
1171 *
1172 * Return: %true if @tree is a copy_on_write tree and @block_mac needs to be
1173 * copied before the node can be written, %false otherwise.
1174 */
block_tree_node_need_copy(struct transaction * tr,const struct block_tree * tree,const struct block_mac * block_mac)1175 static bool block_tree_node_need_copy(struct transaction* tr,
1176 const struct block_tree* tree,
1177 const struct block_mac* block_mac) {
1178 return tree->copy_on_write &&
1179 transaction_block_need_copy(tr, block_mac_to_block(tr, block_mac)) &&
1180 !tr->failed;
1181 }
1182
1183 /**
1184 * block_tree_node_find_block - Helper function for block_tree_walk
1185 * @tr: Transaction object.
1186 * @tree: Tree object.
1187 * @node_block: Block number where @node_ro data is stored.
1188 * @node_ro: Node to search.
1189 * @key: Key to search for.
1190 * @key_is_max: If %true, and @key is not found in a leaf node, use largest
1191 * entry with a key less than @key. If %false, and @key is not
1192 * found in a leaf node, use smallest entry with a key greater
1193 * than @key. Ignored for internal nodes or when @key is found.
1194 * @child: Output arg for child at returned index.
1195 * @prev_key: Output arg. Unmodified if returned index is 0, set to key at
1196 * index - 1 otherwise.
1197 * @next_key: Output arg. Unmodified if returned index does not contain
1198 * an entry, set to key at index otherwise.
1199 *
1200 * Return: index of closest matching entry.
1201 */
block_tree_node_find_block(const struct transaction * tr,const struct block_tree * tree,data_block_t node_block,const struct block_tree_node * node_ro,data_block_t key,bool key_is_max,const struct block_mac ** child,data_block_t * prev_key,data_block_t * next_key)1202 static int block_tree_node_find_block(const struct transaction* tr,
1203 const struct block_tree* tree,
1204 data_block_t node_block,
1205 const struct block_tree_node* node_ro,
1206 data_block_t key,
1207 bool key_is_max,
1208 const struct block_mac** child,
1209 data_block_t* prev_key,
1210 data_block_t* next_key) {
1211 int i;
1212 data_block_t curr_key;
1213 int keys_count;
1214 bool is_leaf = block_tree_node_is_leaf(node_ro);
1215
1216 keys_count = block_tree_node_max_key_count(tree, node_ro);
1217
1218 /* TODO: better search? */
1219 for (i = 0; i < keys_count + 1; i++) {
1220 curr_key = block_tree_node_get_key(tree, node_block, node_ro, i);
1221 if (!curr_key || (key <= (curr_key - !is_leaf))) {
1222 break;
1223 }
1224 curr_key = 0;
1225 }
1226 if (i == keys_count && curr_key) {
1227 assert(tree->inserting.block == node_block);
1228 if (print_ops) {
1229 printf("%s: using inserting block %" PRIu64 ", key %" PRIu64
1230 ", child %" PRIu64 ", data %" PRIu64 "\n",
1231 __func__, tree->inserting.block, tree->inserting.key,
1232 block_mac_to_block(tr, &tree->inserting.child),
1233 block_mac_to_block(tr, &tree->inserting.data));
1234 }
1235 }
1236 if (key_is_max && is_leaf && i > 0 && (!curr_key || curr_key > key)) {
1237 i--;
1238 curr_key = block_tree_node_get_key(tree, node_block, node_ro, i);
1239 }
1240
1241 if (curr_key) {
1242 *next_key = curr_key;
1243 }
1244 if (i > 0) {
1245 /* TODO: move to loop */
1246 *prev_key = block_tree_node_get_key(tree, node_block, node_ro, i - 1);
1247 }
1248
1249 if (is_leaf) {
1250 *child = NULL;
1251 } else {
1252 *child = block_tree_node_get_child(tr, tree, node_block, node_ro, i);
1253 assert(*child);
1254 assert(block_mac_valid(tr, *child));
1255 }
1256 if (print_lookup) {
1257 printf("%s: block %" PRIu64 ", key %" PRIu64
1258 " return %d, child_block %" PRIu64 ", prev_key %" PRIu64
1259 ", next_key %" PRIu64 "\n",
1260 __func__, node_block, key, i,
1261 *child ? block_mac_to_block(tr, *child) : 0, *prev_key,
1262 *next_key);
1263 block_tree_node_print(tr, tree, node_block, node_ro);
1264 }
1265 return i;
1266 }
1267
1268 /**
1269 * block_tree_walk - Walk tree
1270 * @tr: Transaction object.
1271 * @tree: Tree object.
1272 * @key: Key to search for.
1273 * @key_is_max: If %true, and @key is not found in a leaf node, use largest
1274 * entry with a key less than @key. If %false, and @key is not
1275 * found in a leaf node, use smallest entry with a key greater
1276 * than @key. Ignored for internal nodes or when @key is found.
1277 * @path: Tree path object to initalize.
1278 *
1279 * Walk tree to find path to @key or the insert point for @key.
1280 *
1281 * Note that if @key is not found, @path may point to an empty insert slot. A
1282 * caller that is looking for the closest match to @key, must therefore call
1283 * block_tree_path_next to advance @path to a valid entry if this case is hit.
1284 * TODO: Add a helper function or argument to avoid this.
1285 */
block_tree_walk(struct transaction * tr,struct block_tree * tree,data_block_t key,bool key_is_max,struct block_tree_path * path)1286 void block_tree_walk(struct transaction* tr,
1287 struct block_tree* tree,
1288 data_block_t key,
1289 bool key_is_max,
1290 struct block_tree_path* path) {
1291 const struct block_tree_node* node_ro;
1292 const struct block_tree_node* parent_node_ro;
1293 struct obj_ref ref[2] = {
1294 OBJ_REF_INITIAL_VALUE(ref[0]),
1295 OBJ_REF_INITIAL_VALUE(ref[1]),
1296 };
1297 int ref_index = 0;
1298 const struct block_mac* child;
1299 data_block_t prev_key = 0;
1300 data_block_t next_key = 0;
1301 int child_index;
1302 const struct block_mac* block_mac;
1303
1304 if (print_lookup) {
1305 printf("%s: tree root block %" PRIu64 ", key %" PRIu64
1306 ", key_is_max %d\n",
1307 __func__, block_mac_to_block(tr, &tree->root), key, key_is_max);
1308 }
1309
1310 assert(tr);
1311 assert(tree);
1312 assert(tree->block_size <= tr->fs->dev->block_size);
1313
1314 path->count = 0;
1315 memset(&path->data, 0, sizeof(path->data));
1316 path->tr = tr;
1317 path->tree = tree;
1318 path->tree_update_count = tree->update_count;
1319
1320 block_mac = &tree->root;
1321 parent_node_ro = NULL;
1322
1323 while (block_mac && block_mac_valid(tr, block_mac)) {
1324 assert(path->count < countof(path->entry));
1325
1326 node_ro = block_get(tr, block_mac, NULL, &ref[ref_index]);
1327 if (!node_ro) {
1328 assert(tr->failed);
1329 pr_warn("transaction failed, abort\n");
1330 path->count = 0;
1331 goto err;
1332 }
1333 assert(node_ro);
1334
1335 path->entry[path->count].block_mac = *block_mac;
1336
1337 child_index = block_tree_node_find_block(
1338 tr, path->tree, block_mac_to_block(tr, block_mac), node_ro, key,
1339 key_is_max, &child, &prev_key, &next_key);
1340 if (path->count) {
1341 assert(!path->entry[path->count - 1].next_key || next_key);
1342 assert(!path->entry[path->count - 1].next_key ||
1343 next_key <= path->entry[path->count - 1].next_key);
1344 assert(!path->entry[path->count - 1].prev_key || prev_key);
1345 assert(!path->entry[path->count - 1].prev_key ||
1346 prev_key >= path->entry[path->count - 1].prev_key);
1347 }
1348 path->entry[path->count].index = child_index;
1349 path->entry[path->count].prev_key = prev_key;
1350 path->entry[path->count].next_key = next_key;
1351 if (!child) {
1352 assert(block_tree_node_is_leaf(node_ro));
1353 path->data = block_tree_node_get_data(
1354 tr, tree, block_mac_to_block(tr, block_mac), node_ro,
1355 child_index);
1356 assert(!key_is_max || block_mac_valid(path->tr, &path->data) ||
1357 path->count == 0);
1358 }
1359
1360 block_mac = child;
1361 if (parent_node_ro) {
1362 block_put(parent_node_ro, &ref[!ref_index]);
1363 }
1364 parent_node_ro = node_ro;
1365 ref_index = !ref_index;
1366
1367 if (print_lookup) {
1368 printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1369 ", next_key %" PRIu64 "\n",
1370 __func__, path->count,
1371 block_mac_to_block(path->tr,
1372 &path->entry[path->count].block_mac),
1373 path->entry[path->count].index,
1374 path->entry[path->count].prev_key,
1375 path->entry[path->count].next_key);
1376 if (!block_mac) {
1377 printf("%s: path.data.block = %" PRIu64 "\n", __func__,
1378 block_mac_to_block(path->tr, &path->data));
1379 }
1380 }
1381 path->count++;
1382 }
1383 err:
1384 if (parent_node_ro) {
1385 block_put(parent_node_ro, &ref[!ref_index]);
1386 }
1387 }
1388
1389 /**
1390 * block_tree_path_next - Walk tree to get to next entry
1391 * @path: Tree path object.
1392 */
block_tree_path_next(struct block_tree_path * path)1393 void block_tree_path_next(struct block_tree_path* path) {
1394 const struct block_tree_node* node_ro;
1395 const struct block_tree_node* parent_node_ro;
1396 struct obj_ref ref[2] = {
1397 OBJ_REF_INITIAL_VALUE(ref[0]),
1398 OBJ_REF_INITIAL_VALUE(ref[1]),
1399 };
1400 bool ref_index = 0;
1401 unsigned int index;
1402 unsigned int depth;
1403 const struct block_mac* block_mac;
1404 data_block_t prev_key;
1405 data_block_t next_key;
1406 struct block_mac next_data;
1407 const struct block_mac* next_child;
1408 data_block_t parent_next_key;
1409
1410 assert(path->tree_update_count == path->tree->update_count);
1411 assert(path->count);
1412
1413 depth = path->count - 1;
1414 assert(path->entry[depth].next_key);
1415
1416 if (print_lookup) {
1417 printf("%s: tree root block %" PRIu64 "\n", __func__,
1418 block_mac_to_block(path->tr, &path->tree->root));
1419 }
1420
1421 block_mac = &path->entry[depth].block_mac;
1422 index = path->entry[depth].index;
1423 parent_next_key = depth > 0 ? path->entry[depth - 1].next_key : 0;
1424
1425 node_ro = block_get(path->tr, block_mac, NULL, &ref[ref_index]);
1426 if (!node_ro) {
1427 assert(path->tr->failed);
1428 pr_warn("transaction failed, abort\n");
1429 goto err_no_refs;
1430 }
1431 assert(node_ro);
1432 assert(block_tree_node_is_leaf(node_ro));
1433 prev_key = block_tree_node_get_key(path->tree,
1434 block_mac_to_block(path->tr, block_mac),
1435 node_ro, index);
1436 index++;
1437 next_key = block_tree_node_get_key(path->tree,
1438 block_mac_to_block(path->tr, block_mac),
1439 node_ro, index);
1440 next_data = block_tree_node_get_data(
1441 path->tr, path->tree, block_mac_to_block(path->tr, block_mac),
1442 node_ro, index);
1443 block_put(node_ro, &ref[ref_index]);
1444
1445 assert(path->entry[depth].next_key == prev_key || !prev_key);
1446
1447 if (next_key || !parent_next_key) {
1448 assert(!next_key || next_key >= prev_key);
1449 path->entry[depth].index = index;
1450 path->entry[depth].prev_key = prev_key;
1451 path->entry[depth].next_key = next_key;
1452 path->data = next_data;
1453 if (print_lookup) {
1454 printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1455 ", next_key %" PRIu64 "\n",
1456 __func__, depth,
1457 block_mac_to_block(path->tr, &path->entry[depth].block_mac),
1458 path->entry[depth].index, path->entry[depth].prev_key,
1459 path->entry[depth].next_key);
1460 printf("%s: path.data.block = %" PRIu64 "\n", __func__,
1461 block_mac_to_block(path->tr, &path->data));
1462 }
1463 return;
1464 }
1465
1466 assert(depth > 0);
1467
1468 while (depth > 0) {
1469 depth--;
1470 if (!path->entry[depth].next_key) {
1471 continue;
1472 }
1473 block_mac = &path->entry[depth].block_mac;
1474 index = path->entry[depth].index;
1475
1476 node_ro = block_get(path->tr, block_mac, NULL, &ref[ref_index]);
1477 if (!node_ro) {
1478 assert(path->tr->failed);
1479 pr_warn("transaction failed, abort\n");
1480 goto err_no_refs;
1481 }
1482 assert(node_ro);
1483 assert(!block_tree_node_is_leaf(node_ro));
1484
1485 parent_next_key = depth > 0 ? path->entry[depth - 1].next_key : 0;
1486
1487 prev_key = block_tree_node_get_key(
1488 path->tree, block_mac_to_block(path->tr, block_mac), node_ro,
1489 index);
1490 index++;
1491 next_key = block_tree_node_get_key(
1492 path->tree, block_mac_to_block(path->tr, block_mac), node_ro,
1493 index);
1494 next_child = block_tree_node_get_child(
1495 path->tr, path->tree, block_mac_to_block(path->tr, block_mac),
1496 node_ro, index);
1497 if (next_child) {
1498 parent_node_ro = node_ro;
1499 ref_index = !ref_index;
1500 assert(prev_key && prev_key == path->entry[depth].next_key);
1501 path->entry[depth].index = index;
1502 path->entry[depth].prev_key = prev_key;
1503 path->entry[depth].next_key = next_key ?: parent_next_key;
1504 if (print_lookup) {
1505 printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1506 ", next_key %" PRIu64 "\n",
1507 __func__, depth,
1508 block_mac_to_block(path->tr,
1509 &path->entry[depth].block_mac),
1510 path->entry[depth].index, path->entry[depth].prev_key,
1511 path->entry[depth].next_key);
1512 }
1513 break;
1514 }
1515 block_put(node_ro, &ref[ref_index]);
1516 }
1517 assert(next_child);
1518 while (++depth < path->count - 1) {
1519 node_ro = block_get(path->tr, next_child, NULL, &ref[ref_index]);
1520 if (!node_ro) {
1521 assert(path->tr->failed);
1522 pr_warn("transaction failed, abort\n");
1523 goto err_has_parent_ref;
1524 }
1525 assert(node_ro);
1526 assert(!block_tree_node_is_leaf(node_ro));
1527 path->entry[depth].block_mac = *next_child;
1528 block_put(parent_node_ro, &ref[!ref_index]);
1529 path->entry[depth].index = 0;
1530 path->entry[depth].prev_key = prev_key;
1531 path->entry[depth].next_key = block_tree_node_get_key(
1532 path->tree, DATA_BLOCK_INVALID, node_ro, 0);
1533 if (print_lookup) {
1534 printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1535 ", next_key %" PRIu64 "\n",
1536 __func__, depth,
1537 block_mac_to_block(path->tr, &path->entry[depth].block_mac),
1538 path->entry[depth].index, path->entry[depth].prev_key,
1539 path->entry[depth].next_key);
1540 }
1541 next_child = block_tree_node_get_child(path->tr, path->tree,
1542 DATA_BLOCK_INVALID, node_ro, 0);
1543 parent_node_ro = node_ro;
1544 ref_index = !ref_index;
1545 assert(path->entry[depth].next_key);
1546 assert(next_child);
1547 }
1548
1549 assert(next_child);
1550 node_ro = block_get(path->tr, next_child, NULL, &ref[ref_index]);
1551 if (!node_ro) {
1552 assert(path->tr->failed);
1553 pr_warn("transaction failed, abort\n");
1554 goto err_has_parent_ref;
1555 }
1556 assert(node_ro);
1557 assert(block_tree_node_is_leaf(node_ro));
1558 path->entry[depth].block_mac = *next_child;
1559 block_put(parent_node_ro, &ref[!ref_index]);
1560 path->entry[depth].index = 0;
1561 path->entry[depth].prev_key = prev_key;
1562 path->entry[depth].next_key =
1563 block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_ro, 0);
1564 path->data = block_tree_node_get_data(path->tr, path->tree,
1565 DATA_BLOCK_INVALID, node_ro, 0);
1566 block_put(node_ro, &ref[ref_index]);
1567 assert(path->entry[depth].next_key);
1568 if (print_lookup) {
1569 printf("%s: path[%d] = %" PRIu64 ", index %d, prev_key %" PRIu64
1570 ", next_key %" PRIu64 "\n",
1571 __func__, depth,
1572 block_mac_to_block(path->tr, &path->entry[depth].block_mac),
1573 path->entry[depth].index, path->entry[depth].prev_key,
1574 path->entry[depth].next_key);
1575 printf("%s: path.data.block = %" PRIu64 "\n", __func__,
1576 block_mac_to_block(path->tr, &path->data));
1577 }
1578 return;
1579
1580 err_has_parent_ref:
1581 block_put(parent_node_ro, &ref[!ref_index]);
1582 err_no_refs:
1583 assert(path->tr->failed);
1584 path->count = 0;
1585 }
1586
1587 /**
1588 * block_tree_block_dirty - Get writable node
1589 * @tr: Transaction object.
1590 * @path: Tree path object.
1591 * @path_index: Path depth that @node_ro was found at.
1592 * @node_ro: Source node
1593 *
1594 * Copy @node_ro and update @path if needed.
1595 *
1596 * Return: writable pointer to @node_ro.
1597 */
block_tree_block_dirty(struct transaction * tr,struct block_tree_path * path,int path_index,const struct block_tree_node * node_ro)1598 static struct block_tree_node* block_tree_block_dirty(
1599 struct transaction* tr,
1600 struct block_tree_path* path,
1601 int path_index,
1602 const struct block_tree_node* node_ro) {
1603 data_block_t new_block;
1604 struct block_mac* block_mac;
1605
1606 block_mac = &path->entry[path_index].block_mac;
1607
1608 assert(path_index ||
1609 block_mac_same_block(tr, block_mac, &path->tree->root));
1610
1611 if (!block_tree_node_need_copy(tr, path->tree, block_mac)) {
1612 if (tr->failed) {
1613 return NULL;
1614 }
1615 return block_dirty(tr, node_ro, !path->tree->allow_copy_on_write);
1616 }
1617 assert(path->tree->allow_copy_on_write);
1618 new_block = block_allocate_etc(tr, !path->tree->allow_copy_on_write);
1619 if (print_internal_changes) {
1620 printf("%s: copy block %" PRIu64 " to %" PRIu64 "\n", __func__,
1621 block_mac_to_block(tr, block_mac), new_block);
1622 }
1623 if (!new_block) {
1624 return NULL;
1625 }
1626 assert(new_block != block_mac_to_block(tr, block_mac));
1627 assert(!tr->failed);
1628 block_free(tr, block_mac_to_block(tr, block_mac));
1629 if (tr->failed) {
1630 pr_warn("transaction failed, abort\n");
1631 return NULL;
1632 }
1633 block_mac_set_block(tr, block_mac, new_block);
1634 if (!path_index) {
1635 block_mac_set_block(tr, &path->tree->root, new_block);
1636 path->tree->root_block_changed = true;
1637 }
1638 return block_move(tr, node_ro, new_block, !path->tree->allow_copy_on_write);
1639 }
1640
1641 /**
1642 * block_tree_block_get_write - Get writable node fro path
1643 * @tr: Transaction object.
1644 * @path: Tree path object.
1645 * @path_index: Path depth to get node from.
1646 * @ref: Pointer to store reference in.
1647 *
1648 * Get node from @path and @path_index, copying it and updating @path if
1649 * needed.
1650 *
1651 * Return: writable pointer to node.
1652 */
block_tree_block_get_write(struct transaction * tr,struct block_tree_path * path,int path_index,struct obj_ref * ref)1653 static struct block_tree_node* block_tree_block_get_write(
1654 struct transaction* tr,
1655 struct block_tree_path* path,
1656 int path_index,
1657 struct obj_ref* ref) {
1658 const struct block_tree_node* node_ro;
1659 struct block_tree_node* node_rw;
1660
1661 node_ro = block_get(tr, &path->entry[path_index].block_mac, NULL, ref);
1662 if (!node_ro) {
1663 return NULL;
1664 }
1665
1666 node_rw = block_tree_block_dirty(tr, path, path_index, node_ro);
1667 if (!node_rw) {
1668 block_put(node_ro, ref);
1669 }
1670 return node_rw;
1671 }
1672
1673 /**
1674 * block_tree_path_put_dirty - Release dirty node and update tree
1675 * @tr: Transaction object.
1676 * @path: Tree path object.
1677 * @path_index: Path depth to get node from.
1678 * @data: Block data pointed to by leaf node when @path_index is
1679 * @path->count. Tree node otherwise.
1680 * @data_ref: Reference to @data that will be released.
1681 *
1682 * Release dirty external block or tree node and update tree. Update mac values
1683 * and copy blocks as needed until root of tree is reached.
1684 */
block_tree_path_put_dirty(struct transaction * tr,struct block_tree_path * path,int path_index,void * data,struct obj_ref * data_ref)1685 void block_tree_path_put_dirty(struct transaction* tr,
1686 struct block_tree_path* path,
1687 int path_index,
1688 void* data,
1689 struct obj_ref* data_ref) {
1690 unsigned int index;
1691 struct block_mac* block_mac;
1692 struct block_tree_node* parent_node_rw;
1693 bool parent_is_leaf;
1694 struct obj_ref parent_node_ref = OBJ_REF_INITIAL_VALUE(parent_node_ref);
1695 struct obj_ref tmp_ref = OBJ_REF_INITIAL_VALUE(tmp_ref);
1696
1697 if (path_index == (int)path->count) {
1698 assert(path_index < (int)countof(path->entry));
1699 path->entry[path_index].block_mac = path->data; // copy data
1700 }
1701
1702 while (--path_index >= 0) {
1703 parent_node_rw = block_tree_block_get_write(tr, path, path_index,
1704 &parent_node_ref);
1705 if (tr->failed) {
1706 assert(!parent_node_rw);
1707 block_put_dirty_discard(data, data_ref);
1708 pr_warn("transaction failed, abort\n");
1709 return;
1710 }
1711 assert(parent_node_rw);
1712 parent_is_leaf = block_tree_node_is_leaf(parent_node_rw);
1713 index = path->entry[path_index].index;
1714 assert(path_index == (int)path->count - 1 || !parent_is_leaf);
1715 assert(path->tree->child_data_size[parent_is_leaf] <=
1716 sizeof(*block_mac));
1717 assert(path->tree->child_data_size[parent_is_leaf] >=
1718 tr->fs->block_num_size + tr->fs->mac_size);
1719 block_mac = block_tree_node_get_child_data_rw(path->tree,
1720 parent_node_rw, index);
1721
1722 /* check that block was copied if needed */
1723 assert(!block_tree_node_need_copy(tr, path->tree, block_mac) ||
1724 !block_mac_same_block(tr, block_mac,
1725 &path->entry[path_index + 1].block_mac));
1726
1727 /* check that block was not copied when not needed */
1728 assert(block_tree_node_need_copy(tr, path->tree, block_mac) ||
1729 block_mac_same_block(tr, block_mac,
1730 &path->entry[path_index + 1].block_mac) ||
1731 tr->failed);
1732
1733 if (!block_mac_same_block(tr, block_mac,
1734 &path->entry[path_index + 1].block_mac)) {
1735 if (print_internal_changes) {
1736 printf("%s: update copied block %" PRIu64 " to %" PRIu64 "\n",
1737 __func__, block_mac_to_block(tr, block_mac),
1738 block_mac_to_block(
1739 tr, &path->entry[path_index + 1].block_mac));
1740 }
1741 block_mac_set_block(
1742 tr, block_mac,
1743 block_mac_to_block(tr,
1744 &path->entry[path_index + 1].block_mac));
1745 }
1746
1747 if (tr->failed) {
1748 block_put_dirty_discard(data, data_ref);
1749 block_put_dirty_discard(parent_node_rw, &parent_node_ref);
1750 pr_warn("transaction failed, abort\n");
1751 return;
1752 }
1753
1754 assert(block_mac_eq(tr, block_mac,
1755 &path->entry[path_index + 1].block_mac));
1756 block_put_dirty(tr, data, data_ref, block_mac, parent_node_rw);
1757 assert(!block_mac_eq(tr, block_mac,
1758 &path->entry[path_index + 1].block_mac) ||
1759 tr->fs->mac_size < 16);
1760 block_mac_copy_mac(tr, &path->entry[path_index + 1].block_mac,
1761 block_mac);
1762 assert(block_mac_eq(tr, block_mac,
1763 &path->entry[path_index + 1].block_mac));
1764 data = parent_node_rw;
1765 data_ref = &tmp_ref;
1766 obj_ref_transfer(data_ref, &parent_node_ref);
1767 }
1768 block_mac = &path->tree->root;
1769 assert(block_mac_same_block(tr, block_mac, &path->entry[0].block_mac));
1770 assert(block_mac_eq(tr, block_mac, &path->entry[0].block_mac));
1771 block_put_dirty(tr, data, data_ref, block_mac, NULL);
1772 assert(!block_mac_eq(tr, block_mac, &path->entry[0].block_mac) ||
1773 tr->fs->mac_size < 16);
1774 block_mac_copy_mac(tr, &path->entry[0].block_mac, block_mac);
1775 assert(block_mac_eq(tr, block_mac, &path->entry[0].block_mac));
1776 }
1777
1778 /**
1779 * block_tree_update_key - Update key in internal parent node
1780 * @tr: Transaction object.
1781 * @path: Tree path object.
1782 * @path_index: Path depth start search at.
1783 * @new_key: New key value.
1784 *
1785 * Search path until a left parent key is found (non-zero index) then set it
1786 * to @new_key.
1787 *
1788 * TODO: merge with mac update?
1789 */
block_tree_update_key(struct transaction * tr,struct block_tree_path * path,int path_index,data_block_t new_key)1790 void block_tree_update_key(struct transaction* tr,
1791 struct block_tree_path* path,
1792 int path_index,
1793 data_block_t new_key) {
1794 const struct block_mac* block_mac;
1795 unsigned int index;
1796 struct block_tree_node* node_rw;
1797 struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
1798
1799 assert(new_key);
1800
1801 while (path_index >= 0) {
1802 assert(path_index < (int)countof(path->entry));
1803 block_mac = &path->entry[path_index].block_mac;
1804 index = path->entry[path_index].index;
1805 if (index == 0) {
1806 path_index--;
1807 continue;
1808 }
1809 node_rw = block_tree_block_get_write(tr, path, path_index, &node_ref);
1810 if (tr->failed) {
1811 pr_warn("transaction failed, abort\n");
1812 return;
1813 }
1814 assert(node_rw);
1815 if (print_internal_changes) {
1816 printf("%s: block %" PRIu64 ", index %d, [%" PRIu64
1817 "-] -> [%" PRIu64 "-]\n",
1818 __func__, block_mac_to_block(tr, block_mac), index,
1819 block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
1820 node_rw, index - 1),
1821 new_key);
1822 block_tree_node_print(tr, path->tree,
1823 block_mac_to_block(tr, block_mac), node_rw);
1824 }
1825 assert(!block_tree_node_is_leaf(node_rw));
1826 assert(index == 1 || new_key >= block_tree_node_get_key(
1827 path->tree, DATA_BLOCK_INVALID,
1828 node_rw, index - 2));
1829 assert(index == block_tree_node_max_key_count(path->tree, node_rw) ||
1830 new_key <= block_tree_node_get_key(path->tree,
1831 DATA_BLOCK_INVALID, node_rw,
1832 index) ||
1833 !block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_rw,
1834 index));
1835 assert(path->entry[path_index].prev_key ==
1836 block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_rw,
1837 index - 1));
1838 block_tree_node_set_key(path->tree, node_rw, index - 1, new_key);
1839 assert(block_tree_node_check(tr, path->tree, node_rw,
1840 block_mac_to_block(tr, block_mac), 0,
1841 DATA_BLOCK_MAX));
1842 path->entry[path_index].prev_key = new_key;
1843 if (print_internal_changes) {
1844 block_tree_node_print(tr, path->tree,
1845 block_mac_to_block(tr, block_mac), node_rw);
1846 }
1847 block_tree_path_put_dirty(tr, path, path_index, node_rw, &node_ref);
1848 return;
1849 }
1850 if (print_internal_changes) {
1851 printf("%s: root reached, no update needed (new key %" PRIu64 ")\n",
1852 __func__, new_key);
1853 }
1854 }
1855
1856 /**
1857 * block_tree_node_get_key_count - Get number of keys currently stored in node
1858 * @tree: Tree object.
1859 * @node_ro: Node.
1860 */
block_tree_node_get_key_count(const struct block_tree * tree,const struct block_tree_node * node_ro)1861 static int block_tree_node_get_key_count(
1862 const struct block_tree* tree,
1863 const struct block_tree_node* node_ro) {
1864 unsigned int i;
1865 unsigned int max_key_count = block_tree_node_max_key_count(tree, node_ro);
1866
1867 for (i = 0; i < max_key_count; i++) {
1868 if (!block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, i)) {
1869 break;
1870 }
1871 }
1872
1873 return i;
1874 }
1875
1876 /**
1877 * block_tree_node_leaf_remove_entry - Remove entry from a leaf node
1878 * @tr: Transaction object.
1879 * @tree: Tree object.
1880 * @node_block_mac: Block number and mac for @node_rw.
1881 * @node_rw: Node.
1882 * @index: Index of entry to remove.
1883 */
block_tree_node_leaf_remove_entry(struct transaction * tr,const struct block_tree * tree,const struct block_mac * node_block_mac,struct block_tree_node * node_rw,unsigned int index)1884 static void block_tree_node_leaf_remove_entry(
1885 struct transaction* tr,
1886 const struct block_tree* tree,
1887 const struct block_mac* node_block_mac,
1888 struct block_tree_node* node_rw,
1889 unsigned int index) {
1890 unsigned int max_key_count = block_tree_node_max_key_count(tree, node_rw);
1891
1892 if (print_internal_changes) {
1893 printf("%s: index %d:", __func__, index);
1894 block_tree_node_print(tr, tree, block_mac_to_block(tr, node_block_mac),
1895 node_rw);
1896 }
1897 assert(index < max_key_count);
1898 assert(block_tree_node_is_leaf(node_rw));
1899 block_tree_node_shift_down(tree, node_rw, index, index + 1,
1900 SHIFT_LEAF_OR_LEFT_CHILD);
1901 if (print_internal_changes) {
1902 printf("%s: done: ", __func__);
1903 block_tree_node_print(tr, tree, block_mac_to_block(tr, node_block_mac),
1904 node_rw);
1905 }
1906 }
1907
1908 /**
1909 * block_tree_node_split - Split a full node and add entry.
1910 * @tr: Transaction object.
1911 * @path: Tree path object.
1912 * @append_key: Key to add.
1913 * @append_child: Child to add if @path points to an internal node.
1914 * @append_data: Data to add if @path points to a leaf node.
1915 *
1916 * Allocate a new node and move half of the entries to it. If the old node was
1917 * the root node, allocate a new root node. Add new node to parent.
1918 */
block_tree_node_split(struct transaction * tr,struct block_tree_path * path,data_block_t append_key,const struct block_mac * append_child,const struct block_mac * append_data)1919 static void block_tree_node_split(struct transaction* tr,
1920 struct block_tree_path* path,
1921 data_block_t append_key,
1922 const struct block_mac* append_child,
1923 const struct block_mac* append_data) {
1924 struct block_tree_node* parent_node_rw;
1925 struct obj_ref parent_node_ref = OBJ_REF_INITIAL_VALUE(parent_node_ref);
1926 struct block_tree_node* node_left_rw;
1927 struct obj_ref node_left_ref = OBJ_REF_INITIAL_VALUE(node_left_ref);
1928 struct block_tree_node* node_right_rw;
1929 struct obj_ref node_right_ref = OBJ_REF_INITIAL_VALUE(node_right_ref);
1930 bool is_leaf;
1931 struct block_mac* left_block_mac;
1932 struct block_mac* right_block_mac;
1933 data_block_t left_block_num;
1934 struct block_mac right;
1935 const struct block_mac* block_mac;
1936 const struct block_mac* parent_block_mac;
1937 unsigned int parent_index;
1938 int split_index;
1939 data_block_t parent_key;
1940 data_block_t overflow_key = 0;
1941 struct block_mac overflow_child;
1942 size_t max_key_count;
1943
1944 assert(path->count > 0);
1945 assert(path->tree);
1946
1947 block_mac = &path->entry[path->count - 1].block_mac;
1948 path->count--;
1949
1950 if (print_internal_changes) {
1951 printf("%s: tree root %" PRIu64 ", block %" PRIu64 "\n", __func__,
1952 block_mac_to_block(tr, &path->tree->root),
1953 block_mac_to_block(tr, block_mac));
1954 }
1955
1956 assert(append_key);
1957 /* we should only insert one node at a time */
1958 assert(!path->tree->inserting.block);
1959 path->tree->inserting.block = block_mac_to_block(tr, block_mac);
1960 path->tree->inserting.key = append_key;
1961 if (append_data) {
1962 assert(!append_child);
1963 path->tree->inserting.data = *append_data;
1964 }
1965 if (append_child) {
1966 assert(!append_data);
1967 path->tree->inserting.child = *append_child;
1968 }
1969
1970 assert(!path->tree->copy_on_write || path->tree->allow_copy_on_write);
1971
1972 block_mac_set_block(
1973 tr, &right,
1974 block_allocate_etc(tr, !path->tree->allow_copy_on_write));
1975 if (!path->count) {
1976 /* use old block at the new root */
1977 left_block_num =
1978 block_allocate_etc(tr, !path->tree->allow_copy_on_write);
1979 } else {
1980 left_block_num = block_mac_to_block(tr, block_mac);
1981 }
1982 if (tr->failed) {
1983 pr_warn("transaction failed, abort\n");
1984 return;
1985 }
1986 assert(block_mac);
1987 assert(block_mac_valid(tr, block_mac));
1988 assert(path->tree->inserting.block == block_mac_to_block(tr, block_mac));
1989 assert(path->tree->inserting.key == append_key);
1990 assert(block_mac_to_block(tr, &path->tree->inserting.data) ==
1991 (append_data ? block_mac_to_block(tr, append_data) : 0));
1992 assert(block_mac_to_block(tr, &path->tree->inserting.child) ==
1993 (append_child ? block_mac_to_block(tr, append_child) : 0));
1994 memset(&path->tree->inserting, 0, sizeof(path->tree->inserting));
1995
1996 if (print_internal_changes) {
1997 printf("%s: %" PRIu64 " -> %" PRIu64 " %" PRIu64 "\n", __func__,
1998 block_mac_to_block(tr, block_mac), left_block_num,
1999 block_mac_to_block(tr, &right));
2000 }
2001
2002 node_left_rw =
2003 block_tree_block_get_write(tr, path, path->count, &node_left_ref);
2004 if (!node_left_rw) {
2005 assert(tr->failed);
2006 pr_warn("transaction failed, abort\n");
2007 return;
2008 }
2009 assert(node_left_rw);
2010 is_leaf = block_tree_node_is_leaf(node_left_rw);
2011 if (!path->count) {
2012 assert(left_block_num != block_mac_to_block(tr, block_mac));
2013 parent_node_rw = node_left_rw;
2014 obj_ref_transfer(&parent_node_ref, &node_left_ref);
2015 /* TODO: use allocated node for root instead since we need to update the
2016 * mac anyway */
2017 node_left_rw = block_get_copy(tr, node_left_rw, left_block_num,
2018 !path->tree->allow_copy_on_write,
2019 &node_left_ref);
2020 assert(node_left_rw);
2021 if (tr->failed) {
2022 pr_warn("transaction failed, abort\n");
2023 goto err_copy_parent;
2024 }
2025 // parent_node_rw = block_get_cleared(path->tree->root.block);
2026 /* TODO: use block_get_cleared */
2027 memset(parent_node_rw, 0, path->tree->block_size);
2028 parent_index = 0;
2029 left_block_mac = block_tree_node_get_child_data_rw(
2030 path->tree, parent_node_rw, parent_index);
2031 block_mac_set_block(tr, left_block_mac, left_block_num);
2032 parent_block_mac = block_mac;
2033 } else {
2034 assert(left_block_num == block_mac_to_block(tr, block_mac));
2035 parent_block_mac = &path->entry[path->count - 1].block_mac;
2036 parent_index = path->entry[path->count - 1].index;
2037 parent_node_rw = block_tree_block_get_write(tr, path, path->count - 1,
2038 &parent_node_ref);
2039 if (tr->failed) {
2040 pr_warn("transaction failed, abort\n");
2041 goto err_get_parent;
2042 }
2043 assert(parent_node_rw);
2044 assert(!block_tree_node_is_leaf(parent_node_rw));
2045 left_block_mac = block_tree_node_get_child_data_rw(
2046 path->tree, parent_node_rw, parent_index);
2047 }
2048 assert(block_mac_to_block(tr, left_block_mac) == left_block_num);
2049 assert(!tr->failed);
2050 node_right_rw =
2051 block_get_copy(tr, node_left_rw, block_mac_to_block(tr, &right),
2052 !path->tree->allow_copy_on_write, &node_right_ref);
2053 if (tr->failed) {
2054 pr_warn("transaction failed, abort\n");
2055 goto err_copy_right;
2056 }
2057 assert(block_tree_node_is_leaf(node_left_rw) == is_leaf);
2058 assert(block_tree_node_is_leaf(node_right_rw) == is_leaf);
2059 assert(is_leaf || (append_key && append_child));
2060 assert(!is_leaf || (append_key && append_data));
2061 assert(block_tree_node_full(path->tree, node_left_rw));
2062 assert(!tr->failed);
2063
2064 max_key_count = block_tree_node_max_key_count(path->tree, node_left_rw);
2065 split_index = (max_key_count + 1) / 2;
2066
2067 if (print_internal_changes) {
2068 printf("%s: insert split_index %d", __func__, split_index);
2069 block_tree_node_print(tr, path->tree, left_block_num, node_left_rw);
2070 if (is_leaf) {
2071 printf("%s: append key, data: [%" PRIu64 ": %" PRIu64 "]\n",
2072 __func__, append_key, block_mac_to_block(tr, append_data));
2073 } else {
2074 printf("%s: append key, child: [%" PRIu64 "-] %" PRIu64 "\n",
2075 __func__, append_key, block_mac_to_block(tr, append_child));
2076 }
2077 }
2078
2079 /* Update left node */
2080 block_tree_node_clear_end(path->tree, node_left_rw, split_index);
2081 if (print_internal_changes) {
2082 printf("%s: left %3" PRIu64 "", __func__, left_block_num);
2083 block_tree_node_print(tr, path->tree, left_block_num, node_left_rw);
2084 }
2085
2086 /*
2087 * Update right node. For internal nodes the key at split_index moves to
2088 * the parent, for leaf nodes it gets copied
2089 */
2090 parent_key = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2091 node_right_rw, split_index);
2092 block_tree_node_shift_down(path->tree, node_right_rw, 0,
2093 split_index + !is_leaf,
2094 SHIFT_LEAF_OR_LEFT_CHILD);
2095 assert(max_key_count ==
2096 block_tree_node_max_key_count(path->tree, node_right_rw));
2097 block_tree_node_insert(
2098 path->tree, node_right_rw, max_key_count - split_index - !is_leaf,
2099 is_leaf ? SHIFT_LEAF_OR_LEFT_CHILD : SHIFT_RIGHT_CHILD, &append_key,
2100 is_leaf ? append_data : append_child, NULL, NULL);
2101 if (print_internal_changes) {
2102 printf("%s: right %3" PRIu64 "", __func__,
2103 block_mac_to_block(tr, &right));
2104 block_tree_node_print(tr, path->tree, block_mac_to_block(tr, &right),
2105 node_right_rw);
2106 }
2107
2108 /* Insert right node in parent node */
2109 assert(!block_tree_node_is_leaf(parent_node_rw));
2110
2111 if (print_internal_changes) {
2112 printf("%s: insert [%" PRIu64 "-] %" PRIu64 " @%d:", __func__,
2113 parent_key, block_mac_to_block(tr, &right), parent_index);
2114 block_tree_node_print(tr, path->tree,
2115 block_mac_to_block(tr, parent_block_mac),
2116 parent_node_rw);
2117 }
2118
2119 assert(parent_index ==
2120 block_tree_node_max_key_count(path->tree, parent_node_rw) ||
2121 !block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2122 parent_node_rw, parent_index) ||
2123 parent_key < block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2124 parent_node_rw, parent_index));
2125 assert(parent_index == 0 ||
2126 block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2127 parent_node_rw,
2128 parent_index - 1) <= parent_key);
2129 block_tree_node_insert(path->tree, parent_node_rw, parent_index,
2130 SHIFT_RIGHT_CHILD, &parent_key, &right,
2131 &overflow_key, &overflow_child);
2132 assert(!overflow_key == !block_mac_valid(tr, &overflow_child));
2133 /*
2134 * If overflow_key is set it is not safe to re-enter the tree at this point
2135 * as overflow_child is not reachable. It becomes reachable again when
2136 * saved in tree->inserting.child at the top of this function.
2137 */
2138
2139 if (parent_index <
2140 block_tree_node_max_key_count(path->tree, parent_node_rw)) {
2141 right_block_mac = block_tree_node_get_child_data_rw(
2142 path->tree, parent_node_rw, parent_index + 1);
2143 } else {
2144 assert(block_mac_valid(tr, &overflow_child));
2145 right_block_mac = &overflow_child;
2146 }
2147
2148 if (print_internal_changes) {
2149 printf("%s: insert [%" PRIu64 "-] %" PRIu64 " @%d done:", __func__,
2150 parent_key, block_mac_to_block(tr, &right), parent_index);
2151 block_tree_node_print(tr, path->tree,
2152 block_mac_to_block(tr, parent_block_mac),
2153 parent_node_rw);
2154 }
2155
2156 if (!path->count && print_internal_changes) {
2157 printf("%s root", __func__);
2158 block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2159 parent_node_rw);
2160 }
2161
2162 block_put_dirty(tr, node_left_rw, &node_left_ref, left_block_mac,
2163 parent_node_rw);
2164 block_put_dirty(tr, node_right_rw, &node_right_ref, right_block_mac,
2165 parent_node_rw);
2166 /* block_tree_path_put_dirty will handle negative depths */
2167 block_tree_path_put_dirty(tr, path, (int)path->count - 1, parent_node_rw,
2168 &parent_node_ref);
2169
2170 if (overflow_key) {
2171 assert(path->count); /* new root node should not overflow */
2172 if (print_internal_changes) {
2173 printf("%s: overflow [%" PRIu64 "-] %" PRIu64 "\n", __func__,
2174 overflow_key, block_mac_to_block(tr, &overflow_child));
2175 }
2176 /* TODO: use a loop instead */
2177 block_tree_node_split(tr, path, overflow_key, &overflow_child, 0);
2178 }
2179 return;
2180
2181 err_copy_right:
2182 block_put_dirty_discard(node_right_rw, &node_right_ref);
2183 err_copy_parent:
2184 block_put_dirty_discard(parent_node_rw, &parent_node_ref);
2185 err_get_parent:
2186 block_put_dirty_discard(node_left_rw, &node_left_ref);
2187 }
2188
2189 /**
2190 * block_tree_sibling_index - Get the index of a sibling node
2191 * @index: Index of current node.
2192 *
2193 * Use left sibling when possible, otherwise use right sibling. Other siblings
2194 * could be used, but this simple rule avoids selecting an empty sibling if
2195 * a non-empty sibling exists.
2196 */
block_tree_sibling_index(int index)2197 static int block_tree_sibling_index(int index) {
2198 return !index ? 1 : index - 1;
2199 }
2200
2201 /**
2202 * block_tree_get_sibling_block - Get block and mac for sibling block
2203 * @tr: Transaction object.
2204 * @path: Tree path object.
2205 *
2206 * Return: block_mac of sibling block selected by block_tree_sibling_index
2207 */
block_tree_get_sibling_block(struct transaction * tr,struct block_tree_path * path)2208 static struct block_mac block_tree_get_sibling_block(
2209 struct transaction* tr,
2210 struct block_tree_path* path) {
2211 struct block_mac block_mac = BLOCK_MAC_INITIAL_VALUE(block_mac);
2212 const struct block_mac* parent;
2213 const struct block_mac* block_mac_ptr;
2214 int parent_index;
2215 const struct block_tree_node* node_ro;
2216 struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2217
2218 assert(path->tree);
2219 assert(path->count > 1);
2220
2221 parent = &path->entry[path->count - 2].block_mac;
2222 parent_index = path->entry[path->count - 2].index;
2223
2224 node_ro = block_get(tr, parent, NULL, &node_ref);
2225 if (!node_ro) {
2226 assert(tr->failed);
2227 pr_warn("transaction failed, abort\n");
2228 goto err;
2229 }
2230 parent_index = block_tree_sibling_index(parent_index);
2231 block_mac_ptr =
2232 block_tree_node_get_child_data(path->tree, node_ro, parent_index);
2233 block_mac_copy(tr, &block_mac, block_mac_ptr);
2234 assert(block_mac_valid(tr, &block_mac));
2235 block_put(node_ro, &node_ref);
2236 err:
2237 return block_mac;
2238 }
2239
2240 /**
2241 * block_tree_path_set_sibling_block - Set path to point to sibling block_mac
2242 * @path: Tree path object.
2243 * @block_mac: Block-mac of sibling block to swap with path entry
2244 * @left: %true if @block_mac is the left sibling on entry (right on
2245 * return) %false otherwise.
2246 *
2247 * Update @path to point to the left (if @left is %true) or right sibling node
2248 * @block_mac, and return the block_mac of the old node that @path pointed to in
2249 * @block_mac.
2250 *
2251 * This is used to toggle the path between two sibling nodes without re-reading
2252 * the parent node. It does not preserve the prev_key or next_key value that is
2253 * not shared beween the siblings.
2254 */
block_tree_path_set_sibling_block(struct block_tree_path * path,struct block_mac * block_mac,bool left)2255 static void block_tree_path_set_sibling_block(struct block_tree_path* path,
2256 struct block_mac* block_mac,
2257 bool left) {
2258 int parent_index;
2259 struct block_mac tmp;
2260
2261 assert(path->count > 1);
2262 parent_index = path->entry[path->count - 2].index;
2263 // assert(left || parent_index <= 1); // assert only valid on initial call
2264 assert(!left || parent_index > 0);
2265
2266 if (left) {
2267 parent_index--;
2268 } else {
2269 parent_index++;
2270 }
2271
2272 if (print_internal_changes) {
2273 printf("%s: path[%d].index: %d -> %d\n", __func__, path->count - 2,
2274 path->entry[path->count - 2].index, parent_index);
2275 printf("%s: path[%d].block_mac.block: %" PRIu64 " -> %" PRIu64 "\n",
2276 __func__, path->count - 1,
2277 block_mac_to_block(path->tr,
2278 &path->entry[path->count - 1].block_mac),
2279 block_mac_to_block(path->tr, block_mac));
2280 }
2281
2282 path->entry[path->count - 2].index = parent_index;
2283 if (left) {
2284 path->entry[path->count - 2].next_key =
2285 path->entry[path->count - 2].prev_key;
2286 path->entry[path->count - 2].prev_key = 0; // unknown
2287 } else {
2288 path->entry[path->count - 2].prev_key =
2289 path->entry[path->count - 2].next_key;
2290 path->entry[path->count - 2].next_key = 0; // unknown
2291 }
2292
2293 tmp = *block_mac;
2294 *block_mac = path->entry[path->count - 1].block_mac;
2295 path->entry[path->count - 1].block_mac = tmp;
2296 }
2297
2298 static void block_tree_node_merge(struct transaction* tr,
2299 struct block_tree_path* path);
2300
2301 /**
2302 * block_tree_remove_internal - Remove key and right child from internal node
2303 * @tr: Transaction object.
2304 * @path: Tree path object.
2305 *
2306 * Removes an entry from an internal node, and check the resulting node follows
2307 * B+ tree rules. If a root node would have no remaining entries, delete it. If
2308 * a non-root node has too few remaining entries, call block_tree_node_merge
2309 * which will either borrow an entry from, or merge with a sibling node.
2310 *
2311 * TODO: merge with block_tree_node_merge and avoid recursion?
2312 */
block_tree_remove_internal(struct transaction * tr,struct block_tree_path * path)2313 static void block_tree_remove_internal(struct transaction* tr,
2314 struct block_tree_path* path) {
2315 const struct block_tree_node* node_ro;
2316 struct block_tree_node* node_rw;
2317 struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2318 int index;
2319 bool need_merge;
2320 const struct block_mac* block_mac;
2321
2322 assert(path->count);
2323
2324 block_mac = &path->entry[path->count - 1].block_mac;
2325 index = path->entry[path->count - 1].index;
2326 node_ro = block_get(tr, block_mac, NULL, &node_ref);
2327 if (!node_ro) {
2328 assert(tr->failed);
2329 pr_warn("transaction failed, abort\n");
2330 return;
2331 }
2332 assert(!block_tree_node_is_leaf(node_ro));
2333 assert(index > 0);
2334
2335 if (print_internal_changes) {
2336 printf("%s: @%d:", __func__, index);
2337 block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2338 node_ro);
2339 }
2340
2341 if (path->count == 1 &&
2342 !block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID, node_ro, 1)) {
2343 /* block_tree_node_merge always removes right node */
2344 assert(index == 1);
2345 const struct block_mac* new_root =
2346 block_tree_node_get_child_data(path->tree, node_ro, 0);
2347 if (print_internal_changes) {
2348 printf("%s: root emptied, new root %" PRIu64 "\n", __func__,
2349 block_mac_to_block(tr, new_root));
2350 }
2351 assert(block_mac_valid(tr, new_root));
2352 path->tree->root = *new_root;
2353 block_discard_dirty(node_ro);
2354 block_put(node_ro, &node_ref);
2355 assert(!path->tree->copy_on_write || path->tree->allow_copy_on_write);
2356 block_free_etc(tr, block_mac_to_block(tr, block_mac),
2357 !path->tree->allow_copy_on_write);
2358 return;
2359 }
2360
2361 node_rw = block_tree_block_dirty(tr, path, path->count - 1, node_ro);
2362 if (tr->failed) {
2363 block_put(node_ro, &node_ref);
2364 pr_warn("transaction failed, abort\n");
2365 return;
2366 }
2367
2368 block_tree_node_shift_down(path->tree, node_rw, index - 1, index,
2369 SHIFT_RIGHT_CHILD);
2370
2371 if (print_internal_changes) {
2372 printf("%s: @%d done:", __func__, index);
2373 block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2374 node_rw);
2375 }
2376 need_merge =
2377 path->count > 1 && block_tree_below_min_full(path->tree, node_rw);
2378 block_tree_path_put_dirty(tr, path, path->count - 1, node_rw, &node_ref);
2379
2380 if (need_merge) {
2381 block_tree_node_merge(tr, path);
2382 }
2383 }
2384
2385 /**
2386 * block_tree_node_merge - Merge node or borrow entry from sibling block
2387 * @tr: Transaction object.
2388 * @path: Tree path object.
2389 *
2390 * If sibling block has more entries than it needs, migrate an entry from it,
2391 * otherwise merge the two nodes and free the block that stored the right node.
2392 */
block_tree_node_merge(struct transaction * tr,struct block_tree_path * path)2393 static void block_tree_node_merge(struct transaction* tr,
2394 struct block_tree_path* path) {
2395 const struct block_tree_node* node_ro;
2396 const struct block_tree_node* merge_node_ro;
2397 struct block_tree_node* node_rw;
2398 struct block_tree_node* merge_node_rw;
2399 struct obj_ref node_ref1 = OBJ_REF_INITIAL_VALUE(node_ref1);
2400 struct obj_ref node_ref2 = OBJ_REF_INITIAL_VALUE(node_ref2);
2401 struct obj_ref* node_ref = &node_ref1;
2402 struct obj_ref* merge_node_ref = &node_ref2;
2403 const struct block_mac* block_mac;
2404 struct block_mac merge_block;
2405 unsigned int src_index;
2406 unsigned int dest_index;
2407 data_block_t key;
2408 data_block_t parent_key;
2409 int index;
2410 bool node_is_left;
2411 bool is_leaf;
2412
2413 assert(path->count > 1);
2414 assert(path->tree);
2415
2416 block_mac = &path->entry[path->count - 1].block_mac;
2417 node_is_left = !path->entry[path->count - 2].index;
2418 merge_block = block_tree_get_sibling_block(tr, path);
2419
2420 if (tr->failed) {
2421 pr_warn("transaction failed, abort\n");
2422 return;
2423 }
2424
2425 node_ro = block_get(tr, block_mac, NULL, node_ref);
2426 if (!node_ro) {
2427 assert(tr->failed);
2428 pr_warn("transaction failed, abort\n");
2429 return;
2430 }
2431 assert(node_ro);
2432 is_leaf = block_tree_node_is_leaf(node_ro);
2433
2434 merge_node_ro = block_get(tr, &merge_block, NULL, merge_node_ref);
2435 if (!merge_node_ro) {
2436 assert(tr->failed);
2437 block_put(node_ro, node_ref);
2438 pr_warn("transaction failed, abort\n");
2439 return;
2440 }
2441 assert(merge_node_ro);
2442 assert(is_leaf == block_tree_node_is_leaf(merge_node_ro));
2443
2444 if (print_internal_changes) {
2445 printf("%s: node_is_left %d\n", __func__, node_is_left);
2446 block_tree_node_print(tr, path->tree, block_mac_to_block(tr, block_mac),
2447 node_ro);
2448 block_tree_node_print(tr, path->tree,
2449 block_mac_to_block(tr, &merge_block),
2450 merge_node_ro);
2451 }
2452
2453 assert(block_tree_below_min_full(path->tree, node_ro));
2454 assert(!block_tree_below_min_full(path->tree, merge_node_ro));
2455 if (block_tree_above_min_full(path->tree, merge_node_ro)) {
2456 /* migrate entries instead */
2457 block_tree_path_set_sibling_block(path, &merge_block, !node_is_left);
2458 assert(!tr->failed);
2459 merge_node_rw = block_tree_block_dirty(tr, path, path->count - 1,
2460 merge_node_ro);
2461 block_tree_path_set_sibling_block(path, &merge_block, node_is_left);
2462 if (!merge_node_rw) {
2463 assert(tr->failed);
2464 block_put(node_ro, node_ref);
2465 block_put(merge_node_ro, merge_node_ref);
2466 pr_warn("transaction failed, abort\n");
2467 return;
2468 }
2469 assert(!block_tree_node_need_copy(tr, path->tree, &merge_block));
2470
2471 node_rw = block_dirty(tr, node_ro, !path->tree->copy_on_write);
2472 assert(!block_tree_node_need_copy(tr, path->tree, block_mac));
2473
2474 if (node_is_left) {
2475 src_index = 0;
2476 dest_index = block_tree_node_min_full_index(path->tree, node_rw);
2477 } else {
2478 src_index =
2479 block_tree_node_get_key_count(path->tree, merge_node_rw) -
2480 1;
2481 dest_index = 0;
2482 }
2483
2484 key = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2485 merge_node_rw, src_index);
2486 if (node_is_left && is_leaf) {
2487 parent_key = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2488 merge_node_rw, 1);
2489 } else {
2490 parent_key = key;
2491 }
2492 if (!is_leaf) {
2493 if (node_is_left) {
2494 key = path->entry[path->count - 2].next_key;
2495 } else {
2496 key = path->entry[path->count - 2].prev_key;
2497 }
2498 }
2499
2500 block_tree_node_insert(path->tree, node_rw, dest_index,
2501 (node_is_left && !is_leaf)
2502 ? SHIFT_RIGHT_CHILD
2503 : SHIFT_LEAF_OR_LEFT_CHILD,
2504 &key,
2505 block_tree_node_get_child_data(
2506 path->tree, merge_node_rw,
2507 src_index + (!node_is_left && !is_leaf)),
2508 NULL, NULL);
2509
2510 block_tree_node_shift_down(
2511 path->tree, merge_node_rw, src_index, src_index + 1,
2512 (node_is_left || is_leaf) ? SHIFT_LEAF_OR_LEFT_CHILD
2513 : SHIFT_RIGHT_CHILD);
2514
2515 if (node_is_left) {
2516 if (dest_index == 0 && is_leaf) {
2517 data_block_t key0;
2518 key0 = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2519 node_rw, 0);
2520 assert(key0);
2521 block_tree_update_key(tr, path, path->count - 2, key0);
2522 }
2523 block_tree_path_set_sibling_block(path, &merge_block,
2524 !node_is_left);
2525 }
2526 block_tree_update_key(tr, path, path->count - 2, parent_key);
2527 if (node_is_left) {
2528 block_tree_path_set_sibling_block(path, &merge_block, node_is_left);
2529 }
2530 if (print_internal_changes) {
2531 printf("%s: %" PRIu64 " %" PRIu64 " done\n", __func__,
2532 block_mac_to_block(tr, block_mac),
2533 block_mac_to_block(tr, &merge_block));
2534 block_tree_node_print(tr, path->tree,
2535 block_mac_to_block(tr, block_mac), node_rw);
2536 block_tree_node_print(tr, path->tree,
2537 block_mac_to_block(tr, &merge_block),
2538 merge_node_rw);
2539 }
2540 block_tree_path_put_dirty(tr, path, path->count - 1, node_rw, node_ref);
2541 block_tree_path_set_sibling_block(path, &merge_block, !node_is_left);
2542 block_tree_path_put_dirty(tr, path, path->count - 1, merge_node_rw,
2543 merge_node_ref);
2544 block_tree_path_set_sibling_block(path, &merge_block, node_is_left);
2545 } else {
2546 const struct block_mac* left;
2547 const struct block_mac* right;
2548 data_block_t* merge_key;
2549 if (!node_is_left) {
2550 const struct block_tree_node* tmp_node;
2551 struct obj_ref* tmp_ref;
2552 tmp_node = node_ro;
2553 node_ro = merge_node_ro;
2554 merge_node_ro = tmp_node;
2555 tmp_ref = node_ref;
2556 node_ref = merge_node_ref;
2557 merge_node_ref = tmp_ref;
2558 block_tree_path_set_sibling_block(path, &merge_block, true);
2559 }
2560 left = block_mac;
2561 right = &merge_block;
2562 node_rw = block_tree_block_dirty(tr, path, path->count - 1, node_ro);
2563 if (!node_rw) {
2564 assert(tr->failed);
2565 block_put(node_ro, node_ref);
2566 block_put(merge_node_ro, merge_node_ref);
2567 pr_warn("transaction failed, abort\n");
2568 return;
2569 }
2570 assert(!block_tree_node_need_copy(tr, path->tree, left));
2571
2572 index = block_tree_node_get_key_count(path->tree, node_rw);
2573 if (is_leaf) {
2574 merge_key = NULL;
2575 } else {
2576 merge_key = &path->entry[path->count - 2].next_key;
2577 assert(*merge_key);
2578 }
2579 block_tree_node_merge_entries(
2580 path->tree, node_rw, merge_node_ro, index,
2581 block_tree_node_get_key_count(path->tree, merge_node_ro),
2582 merge_key);
2583 assert(block_tree_node_check(tr, path->tree, node_rw,
2584 block_mac_to_block(tr, left), 0,
2585 DATA_BLOCK_MAX));
2586
2587 if (is_leaf && !block_tree_node_min_full_index(path->tree, node_rw) &&
2588 !index) {
2589 data_block_t key0;
2590 key0 = block_tree_node_get_key(path->tree, DATA_BLOCK_INVALID,
2591 node_rw, 0);
2592 assert(key0);
2593 if (print_internal_changes) {
2594 printf("hack, special case for order <= 4 tree\n");
2595 }
2596 block_tree_update_key(tr, path, path->count - 2, key0);
2597 }
2598
2599 if (print_internal_changes) {
2600 printf("%s: merge done, free %" PRIu64 "\n", __func__,
2601 block_mac_to_block(tr, right));
2602 block_tree_node_print(tr, path->tree, block_mac_to_block(tr, left),
2603 node_rw);
2604 }
2605
2606 block_tree_path_put_dirty(tr, path, path->count - 1, node_rw, node_ref);
2607 block_discard_dirty(merge_node_ro);
2608 block_put(merge_node_ro, merge_node_ref);
2609 block_tree_path_set_sibling_block(path, &merge_block, false);
2610 path->count--;
2611 block_tree_remove_internal(tr, path);
2612 block_free_etc(tr, block_mac_to_block(tr, block_mac),
2613 !path->tree->allow_copy_on_write);
2614 }
2615 }
2616
2617 /**
2618 * block_tree_insert_block_mac - Insert an entry into a B+ tree
2619 * @tr: Transaction object.
2620 * @tree: Tree object.
2621 * @key: Key of new entry.
2622 * @data: Data of new entry.
2623 *
2624 * Find a valid insert point for @key and insert @key and @data there. If this
2625 * node becomes overfull, split it.
2626 *
2627 * TODO: use a void * for data and merge with block_tree_insert.
2628 * TODO: Allow insert by path
2629 */
block_tree_insert_block_mac(struct transaction * tr,struct block_tree * tree,data_block_t key,struct block_mac data)2630 void block_tree_insert_block_mac(struct transaction* tr,
2631 struct block_tree* tree,
2632 data_block_t key,
2633 struct block_mac data) {
2634 const struct block_tree_node* node_ro;
2635 struct block_tree_node* node_rw;
2636 struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2637 const struct block_mac* block_mac;
2638 data_block_t overflow_key = 0;
2639 struct block_mac overflow_data;
2640 int index;
2641 struct block_tree_path path;
2642
2643 assert(!tr->failed);
2644 assert(!tree->updating);
2645 assert(key);
2646 assert(block_mac_valid(tr, &data));
2647
2648 tree->updating = true;
2649
2650 if (!block_mac_valid(tr, &tree->root)) {
2651 assert(!tree->copy_on_write || tree->allow_copy_on_write);
2652 block_mac_set_block(tr, &tree->root,
2653 block_allocate_etc(tr, !tree->allow_copy_on_write));
2654 if (tr->failed) {
2655 pr_warn("transaction failed, abort\n");
2656 goto err;
2657 }
2658 if (print_internal_changes) {
2659 printf("%s: new root block %" PRIu64 "\n", __func__,
2660 block_mac_to_block(tr, &tree->root));
2661 }
2662 assert(block_mac_valid(tr, &tree->root));
2663 node_rw = block_get_cleared(tr, block_mac_to_block(tr, &tree->root),
2664 !tree->allow_copy_on_write, &node_ref);
2665 node_rw->is_leaf = true;
2666 block_put_dirty(tr, node_rw, &node_ref, &tree->root, NULL);
2667 tree->root_block_changed = true;
2668 }
2669
2670 block_tree_walk(tr, tree, key, false, &path);
2671 if (tr->failed) {
2672 pr_warn("transaction failed, abort\n");
2673 goto err;
2674 }
2675
2676 assert(path.count > 0);
2677
2678 block_mac = &path.entry[path.count - 1].block_mac;
2679 index = path.entry[path.count - 1].index;
2680
2681 node_ro = block_get(tr, block_mac, NULL, &node_ref);
2682 if (!node_ro) {
2683 assert(tr->failed);
2684 pr_warn("transaction failed, abort\n");
2685 goto err;
2686 }
2687
2688 if (print_changes) {
2689 printf("%s: key %" PRIu64 ", data.block %" PRIu64 ", index %d\n",
2690 __func__, key, block_mac_to_block(tr, &data), index);
2691 assert(node_ro);
2692 block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac),
2693 node_ro);
2694 }
2695
2696 assert(node_ro);
2697 assert(block_tree_node_is_leaf(node_ro));
2698 assert(index || !path.entry[path.count - 1].prev_key ||
2699 block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, index) ==
2700 key);
2701
2702 node_rw = block_tree_block_dirty(tr, &path, path.count - 1, node_ro);
2703 if (!node_rw) {
2704 assert(tr->failed);
2705 pr_warn("transaction failed, abort\n");
2706 block_put(node_ro, &node_ref);
2707 goto err;
2708 }
2709
2710 block_tree_node_insert(tree, node_rw, index, SHIFT_LEAF_OR_LEFT_CHILD, &key,
2711 &data, &overflow_key, &overflow_data);
2712
2713 block_tree_path_put_dirty(tr, &path, path.count - 1, node_rw, &node_ref);
2714
2715 if (overflow_key) {
2716 assert(block_mac_valid(tr, &overflow_data));
2717 block_tree_node_split(tr, &path, overflow_key, NULL, &overflow_data);
2718 }
2719
2720 if (print_changes_full_tree) {
2721 printf("%s: key %" PRIu64 ", data.block %" PRIu64 ", done:\n", __func__,
2722 key, block_mac_to_block(tr, &data));
2723 block_tree_print(tr, tree);
2724 }
2725
2726 err:
2727 tree->update_count++;
2728 tree->updating = false;
2729 full_assert(block_tree_check(tr, tree));
2730 }
2731
2732 /**
2733 * block_tree_insert - Insert a data_block_t type entry into a B+ tree
2734 * @tr: Transaction object.
2735 * @tree: Tree object.
2736 * @key: Key of new entry.
2737 * @data_block: Data of new entry.
2738 */
block_tree_insert(struct transaction * tr,struct block_tree * tree,data_block_t key,data_block_t data_block)2739 void block_tree_insert(struct transaction* tr,
2740 struct block_tree* tree,
2741 data_block_t key,
2742 data_block_t data_block) {
2743 struct block_mac data = BLOCK_MAC_INITIAL_VALUE(data);
2744
2745 block_mac_set_block(tr, &data, data_block);
2746
2747 block_tree_insert_block_mac(tr, tree, key, data);
2748 }
2749
2750 /**
2751 * block_tree_remove - Remove an entry from a B+ tree
2752 * @tr: Transaction object.
2753 * @tree: Tree object.
2754 * @key: Key of entry to remove.
2755 * @data: Data of entry to remove.
2756 *
2757 * Find an entry in the tree where the key matches @key, and the start of data
2758 * matches @data, and remove it from the tree. Call block_tree_node_merge if
2759 * the resulting node does follow B+ tree rules.
2760 *
2761 * TODO: Remove by by path instead
2762 */
block_tree_remove(struct transaction * tr,struct block_tree * tree,data_block_t key,data_block_t data)2763 void block_tree_remove(struct transaction* tr,
2764 struct block_tree* tree,
2765 data_block_t key,
2766 data_block_t data) {
2767 struct block_tree_path path;
2768 const struct block_tree_node* node_ro;
2769 struct block_tree_node* node_rw;
2770 struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2771 const struct block_mac* block_mac;
2772 int index;
2773 bool need_merge = false;
2774 data_block_t new_parent_key;
2775
2776 assert(!tr->failed);
2777 assert(!tree->updating);
2778 assert(block_mac_valid(tr, &tree->root));
2779 full_assert(block_tree_check(tr, tree));
2780 assert(key);
2781 assert(data);
2782
2783 tree->updating = true;
2784
2785 block_tree_walk(tr, tree, key, false, &path); /* TODO: make writeable */
2786 if (tr->failed) {
2787 pr_warn("transaction failed, abort\n");
2788 goto err;
2789 }
2790 assert(path.count > 0);
2791
2792 if (block_mac_to_block(tr, &path.data) != data) {
2793 /* TODO: make path writeable after finding maching entry */
2794 block_tree_walk(tr, tree, key - 1, true, &path);
2795 while (block_mac_to_block(tr, &path.data) != data ||
2796 block_tree_path_get_key(&path) != key) {
2797 assert(block_tree_path_get_key(&path));
2798 block_tree_path_next(&path);
2799 }
2800 }
2801
2802 block_mac = &path.entry[path.count - 1].block_mac;
2803 index = path.entry[path.count - 1].index;
2804
2805 node_ro = block_get(tr, block_mac, NULL, &node_ref);
2806 if (!node_ro) {
2807 assert(tr->failed);
2808 pr_warn("transaction failed, abort\n");
2809 goto err;
2810 }
2811 assert(block_tree_node_is_leaf(node_ro));
2812 assert(block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, index) ==
2813 key);
2814 assert(!memcmp(block_tree_node_get_child_data(tree, node_ro, index), &data,
2815 MIN(sizeof(data), tr->fs->block_num_size)));
2816
2817 if (print_changes) {
2818 printf("%s: key %" PRIu64 ", data %" PRIu64 ", index %d\n", __func__,
2819 key, data, index);
2820 block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac),
2821 node_ro);
2822 }
2823
2824 node_rw = block_tree_block_dirty(tr, &path, path.count - 1, node_ro);
2825 if (tr->failed) {
2826 block_put(node_ro, &node_ref);
2827 pr_warn("transaction failed, abort\n");
2828 goto err;
2829 }
2830 assert(node_rw);
2831 block_tree_node_leaf_remove_entry(tr, tree, block_mac, node_rw, index);
2832 if (path.count > 1) {
2833 if (!index) {
2834 new_parent_key = block_tree_node_get_key(tree, DATA_BLOCK_INVALID,
2835 node_rw, 0);
2836 assert(new_parent_key ||
2837 !block_tree_node_min_full_index(tree, node_rw));
2838 if (new_parent_key) {
2839 block_tree_update_key(tr, &path, path.count - 2,
2840 new_parent_key);
2841 }
2842 }
2843 need_merge = block_tree_below_min_full(tree, node_rw);
2844 }
2845 block_tree_path_put_dirty(tr, &path, path.count - 1, node_rw, &node_ref);
2846 if (need_merge) {
2847 block_tree_node_merge(tr, &path);
2848 }
2849
2850 if (print_changes_full_tree) {
2851 printf("%s: key %" PRIu64 ", data %" PRIu64 ", done:\n", __func__, key,
2852 data);
2853 block_tree_print(tr, tree);
2854 }
2855
2856 err:
2857 tree->update_count++;
2858 tree->updating = false;
2859 full_assert(block_tree_check(tr, tree));
2860 }
2861
2862 /**
2863 * block_tree_update_block_mac - Update key or data of an existing B+ tree entry
2864 * @tr: Transaction object.
2865 * @tree: Tree object.
2866 * @old_key: Key of entry to update.
2867 * @old_data: Data of entry to update.
2868 * @new_key: New key of entry.
2869 * @new_data: New data of entry.
2870 *
2871 * Find an entry in the tree where the key matches @key, and the start of data
2872 * matches @data, and update it's key or data. When updating key, the new key
2873 * must not cause the entry to move in the tree.
2874 *
2875 * TODO: Update by path instead
2876 */
block_tree_update_block_mac(struct transaction * tr,struct block_tree * tree,data_block_t old_key,struct block_mac old_data,data_block_t new_key,struct block_mac new_data)2877 void block_tree_update_block_mac(struct transaction* tr,
2878 struct block_tree* tree,
2879 data_block_t old_key,
2880 struct block_mac old_data,
2881 data_block_t new_key,
2882 struct block_mac new_data) {
2883 struct block_tree_path path;
2884 const struct block_tree_node* node_ro;
2885 struct block_tree_node* node_rw;
2886 struct obj_ref node_ref = OBJ_REF_INITIAL_VALUE(node_ref);
2887 const struct block_mac* block_mac;
2888 data_block_t prev_key;
2889 data_block_t next_key;
2890 unsigned int index;
2891 size_t max_key_count;
2892
2893 assert(!tr->failed);
2894 assert(!tree->updating);
2895 assert(block_mac_valid(tr, &tree->root));
2896 full_assert(block_tree_check(tr, tree));
2897 assert(old_key);
2898 assert(block_mac_valid(tr, &old_data));
2899 assert(new_key);
2900 assert(block_mac_valid(tr, &new_data));
2901 assert(old_key == new_key ||
2902 block_mac_same_block(tr, &old_data, &new_data));
2903 assert(old_key != new_key ||
2904 !block_mac_same_block(tr, &old_data, &new_data));
2905
2906 tree->updating = true;
2907
2908 block_tree_walk(tr, tree, old_key - 1, true, &path);
2909 prev_key = block_tree_path_get_key(&path);
2910 /* modify leftmost entry in tree */
2911 if (prev_key == old_key &&
2912 block_mac_same_block(tr, &path.data, &old_data)) {
2913 prev_key = 0;
2914 }
2915
2916 block_tree_walk(tr, tree, old_key, false, &path); /* TODO: make writeable */
2917 if (tr->failed) {
2918 pr_warn("transaction failed, abort\n");
2919 goto err;
2920 }
2921 assert(path.count > 0);
2922
2923 if (!block_mac_same_block(tr, &path.data, &old_data)) {
2924 /* TODO: make path writeable after finding maching entry */
2925 block_tree_walk(tr, tree, old_key - 1, true, &path);
2926 while (!block_mac_same_block(tr, &path.data, &old_data) ||
2927 block_tree_path_get_key(&path) != old_key) {
2928 assert(block_tree_path_get_key(&path));
2929 block_tree_path_next(&path);
2930 }
2931 }
2932
2933 block_mac = &path.entry[path.count - 1].block_mac;
2934 index = path.entry[path.count - 1].index;
2935
2936 node_ro = block_get(tr, block_mac, NULL, &node_ref);
2937 if (!node_ro) {
2938 assert(tr->failed);
2939 pr_warn("transaction failed, abort\n");
2940 goto err;
2941 }
2942 max_key_count = block_tree_node_max_key_count(tree, node_ro);
2943 assert(block_tree_node_is_leaf(node_ro));
2944 assert(block_tree_node_get_key(tree, DATA_BLOCK_INVALID, node_ro, index) ==
2945 old_key);
2946 assert(block_mac_same_block(
2947 tr, block_tree_node_get_child_data(tree, node_ro, index),
2948 &old_data));
2949
2950 if (print_changes) {
2951 printf("%s: key %" PRIu64 "->%" PRIu64 ", data %" PRIu64 "->%" PRIu64
2952 ", index %d\n",
2953 __func__, old_key, new_key, block_mac_to_block(tr, &old_data),
2954 block_mac_to_block(tr, &new_data), index);
2955 block_tree_node_print(tr, tree, block_mac_to_block(tr, block_mac),
2956 node_ro);
2957 }
2958
2959 next_key = (index + 1 < max_key_count)
2960 ? block_tree_node_get_key(tree, DATA_BLOCK_INVALID,
2961 node_ro, index + 1)
2962 : 0;
2963 if (path.count > 1 && !next_key) {
2964 next_key = path.entry[path.count - 2].next_key;
2965 }
2966
2967 node_rw = block_tree_block_dirty(tr, &path, path.count - 1, node_ro);
2968 if (tr->failed) {
2969 block_put(node_ro, &node_ref);
2970 pr_warn("transaction failed, abort\n");
2971 goto err;
2972 }
2973 assert(node_rw);
2974
2975 if (old_key == new_key) {
2976 assert(tree->child_data_size[1] <= sizeof(new_data));
2977 memcpy(block_tree_node_get_child_data_rw(tree, node_rw, index),
2978 &new_data, tree->child_data_size[1]);
2979 } else if (new_key >= prev_key && (new_key <= next_key || !next_key)) {
2980 block_tree_node_set_key(tree, node_rw, index, new_key);
2981 if (!index) {
2982 path.count--;
2983 assert(new_key);
2984 assert(new_key == block_tree_node_get_key(tree, DATA_BLOCK_INVALID,
2985 node_rw, index));
2986 /* A negative depth to block_tree_update_key indicates a no-op */
2987 block_tree_update_key(tr, &path, (int)path.count - 1, new_key);
2988 path.count++;
2989 }
2990 } else {
2991 assert(0); /* moving entries with block_tree_update is not supported */
2992 }
2993 block_tree_path_put_dirty(tr, &path, path.count - 1, node_rw, &node_ref);
2994
2995 if (print_changes_full_tree) {
2996 printf("%s: key %" PRIu64 "->%" PRIu64 ", data %" PRIu64 "->%" PRIu64
2997 ", done:\n",
2998 __func__, old_key, new_key, block_mac_to_block(tr, &old_data),
2999 block_mac_to_block(tr, &new_data));
3000 block_tree_print(tr, tree);
3001 }
3002
3003 err:
3004 tree->update_count++;
3005 tree->updating = false;
3006 full_assert(block_tree_check(tr, tree));
3007 }
3008
3009 /**
3010 * block_tree_update_block_mac - Update key or data of an existing B+ tree entry
3011 * @tr: Transaction object.
3012 * @tree: Tree object.
3013 * @old_key: Key of entry to update.
3014 * @old_data_block: Data of entry to update.
3015 * @new_key: New key of entry.
3016 * @new_data_block: New data of entry.
3017 *
3018 * Same as block_tree_update_block_mac but data of type data_block_t.
3019 *
3020 * TODO: Merge with block_tree_update_block_mac
3021 */
block_tree_update(struct transaction * tr,struct block_tree * tree,data_block_t old_key,data_block_t old_data_block,data_block_t new_key,data_block_t new_data_block)3022 void block_tree_update(struct transaction* tr,
3023 struct block_tree* tree,
3024 data_block_t old_key,
3025 data_block_t old_data_block,
3026 data_block_t new_key,
3027 data_block_t new_data_block) {
3028 struct block_mac old_data = BLOCK_MAC_INITIAL_VALUE(old_data);
3029 struct block_mac new_data = BLOCK_MAC_INITIAL_VALUE(new_data);
3030
3031 block_mac_set_block(tr, &old_data, old_data_block);
3032 block_mac_set_block(tr, &new_data, new_data_block);
3033
3034 block_tree_update_block_mac(tr, tree, old_key, old_data, new_key, new_data);
3035 }
3036
3037 /**
3038 * block_tree_init - Initialize in-memory state of block backed B+ tree
3039 * @tree: Tree object.
3040 * @block_size: Block size.
3041 * @key_size: Key size. [1-8].
3042 * @child_size: Child size. [@key_size-24].
3043 * @data_size: Data size. [1-24].
3044 *
3045 * TODO: set copy-on-write flags.
3046 */
block_tree_init(struct block_tree * tree,size_t block_size,size_t key_size,size_t child_size,size_t data_size)3047 void block_tree_init(struct block_tree* tree,
3048 size_t block_size,
3049 size_t key_size,
3050 size_t child_size,
3051 size_t data_size) {
3052 memset(tree, 0, sizeof(*tree));
3053 block_tree_set_sizes(tree, block_size, key_size, child_size, data_size);
3054 }
3055
3056 /**
3057 * block_tree_init - Initialize in-memory state of copy-on-write B+ tree
3058 * @dst: New tree object.
3059 * @src: Source tree object.
3060 *
3061 * Initialize @dst to point to the same tree as @src, but set flag to enable
3062 * copy-on-write so all nodes that need changes get copied to new blocks before
3063 * those changes are made.
3064 */
block_tree_copy(struct block_tree * dst,const struct block_tree * src)3065 void block_tree_copy(struct block_tree* dst, const struct block_tree* src) {
3066 assert(src->copy_on_write);
3067 *dst = *src;
3068 dst->allow_copy_on_write = true;
3069 }
3070
3071 #if BUILD_STORAGE_TEST
log_n(double n,double x)3072 static double log_n(double n, double x) {
3073 return log(x) / log(n);
3074 }
3075
3076 static bool block_tree_max_depth_needed;
3077
block_tree_check_config(struct block_device * dev)3078 void block_tree_check_config(struct block_device* dev) {
3079 struct block_tree tree[1];
3080 int node_internal_min_key_count;
3081 int node_internal_max_key_count;
3082 int node_min_child_count;
3083 int node_max_child_count;
3084 int node_leaf_min_key_count;
3085 int node_leaf_max_key_count;
3086 double tree_min_entry_count;
3087 double tree_max_entry_count;
3088 int depth;
3089 int max_entries_needed = (dev->block_count + 1) / 2;
3090 int depth_needed;
3091
3092 block_tree_set_sizes(tree, dev->block_size, sizeof(data_block_t),
3093 sizeof(struct block_mac), sizeof(struct block_mac));
3094
3095 node_internal_min_key_count = block_tree_min_key_count(tree, false);
3096 node_internal_max_key_count = block_tree_max_key_count(tree, false);
3097 node_min_child_count = node_internal_min_key_count + 1;
3098 node_max_child_count = node_internal_max_key_count + 1;
3099 node_leaf_min_key_count = block_tree_min_key_count(tree, true);
3100 node_leaf_max_key_count = block_tree_max_key_count(tree, true);
3101
3102 printf("block_tree config\n");
3103 printf("internal min key count: %6d\n", node_internal_min_key_count);
3104 printf("internal max key count: %6d\n", node_internal_max_key_count);
3105 printf("min child count: %6d\n", node_min_child_count);
3106 printf("max child count: %6d\n", node_max_child_count);
3107 printf("leaf min key count: %6d\n", node_leaf_min_key_count);
3108 printf("leaf max key count: %6d\n", node_leaf_max_key_count);
3109 printf("block size: %6zd\n", dev->block_size);
3110 printf("block count: %6" PRIu64 "\n", dev->block_count);
3111 printf("max entries needed: %6d\n", max_entries_needed);
3112 printf("max depth: %6d\n", BLOCK_TREE_MAX_DEPTH);
3113 for (depth = 1; depth <= BLOCK_TREE_MAX_DEPTH; depth++) {
3114 if (depth == 1) {
3115 tree_min_entry_count = 0;
3116 tree_max_entry_count = node_leaf_max_key_count;
3117 } else if (depth == 2) {
3118 tree_min_entry_count = node_leaf_min_key_count * 2;
3119 } else {
3120 tree_min_entry_count *= node_min_child_count;
3121 }
3122 tree_max_entry_count *= node_max_child_count;
3123 printf("depth %2d: entry count range: %20.15g - %20.15g (alt. %15.10g - %15.10g)\n",
3124 depth, tree_min_entry_count, tree_max_entry_count,
3125 2.0 * pow(ceil(node_max_child_count / 2.0), depth - 1),
3126 pow(node_max_child_count, depth) -
3127 pow(node_max_child_count, depth - 1));
3128 }
3129 depth_needed = ceil(log_n(ceil(node_max_child_count / 2.0),
3130 ceil(max_entries_needed / 2.0))) +
3131 1;
3132 printf("est. tree depth needed for %u entries: %d\n", max_entries_needed,
3133 depth_needed);
3134 printf("est. tree depth needed for %" PRIu64 " entries: %g\n",
3135 ((data_block_t)~0 / dev->block_size),
3136 ceil(log_n(ceil(node_max_child_count / 2.0),
3137 ceil(((data_block_t)~0 / dev->block_size) / 2.0)) +
3138 1.0));
3139 printf("est. tree depth needed for %" PRIu64 " entries: %g\n",
3140 ((data_block_t)~0),
3141 ceil(log_n(ceil(node_max_child_count / 2.0),
3142 (((data_block_t)~0) / 2 + 1)) +
3143 1.0));
3144 assert(tree_min_entry_count >= max_entries_needed);
3145 if (tree_min_entry_count / node_min_child_count < max_entries_needed) {
3146 block_tree_max_depth_needed = true;
3147 }
3148 }
3149
block_tree_check_config_done(void)3150 void block_tree_check_config_done(void) {
3151 assert(block_tree_max_depth_needed);
3152 }
3153
3154 #endif
3155