app/storage/super.c

/*
 * Copyright (C) 2015-2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <inttypes.h>
#include <lk/compiler.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#ifndef LOCAL_TRACE
#define LOCAL_TRACE TRACE_LEVEL_INIT
#endif
#ifndef LOCAL_TRACE_ERR
#define LOCAL_TRACE_ERR TRACE_LEVEL_INIT
#endif

#include "array.h"
#include "block_allocator.h"
#include "block_cache.h"
#include "block_set.h"
#include "checkpoint.h"
#include "debug.h"
#include "error_reporting.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"

/* Magic value identifying a super block; checked by super_block_valid(). */
#define SUPER_BLOCK_MAGIC (0x0073797473757274ULL) /* trustys */
/* Bits of the flags field that hold the wrapping two-bit version counter. */
#define SUPER_BLOCK_FLAGS_VERSION_MASK (0x3U)
/*
 * Low bit of the version. Selects which entry of fs->super_block[] a given
 * version is written to, and must match the index the block was read from.
 */
#define SUPER_BLOCK_FLAGS_BLOCK_INDEX_MASK (0x1U)
/* Set when a super block is written without free and files roots. */
#define SUPER_BLOCK_FLAGS_EMPTY (0x4U)
/* Set when a super block is written while fs->alternate_data is set. */
#define SUPER_BLOCK_FLAGS_ALTERNATE (0x8U)
/* Flag bits understood here; super_block_valid() rejects any others. */
#define SUPER_BLOCK_FLAGS_SUPPORTED_MASK (0xfU)
/* File system version stored in every super block written by this file. */
#define SUPER_BLOCK_FS_VERSION (0U)

/**
 * typedef super_block_opt_flags8_t - Optional flags, can be ORed together
 *
 * %SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3
 *   Indicates that the superblock has additional data after flags2 and that
 *   flags3 should be set to the same value as flags. When this flag is clear,
 *   super_block_valid() does not compare flags3 against flags.
 * %SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT
 *   Indicates that the superblock contains the @checkpoint field.
 *   fs_init_from_super() only treats the field as present when
 *   %SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 is set as well.
 * %SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN
 *   An error was detected in this file system, a full scan and possibly repair
 *   should be initiated on the next mount. Reset after scanning.
 *
 * update_super_block_internal() always sets the first two flags and sets
 * %SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN when fs->needs_full_scan is set.
 */
typedef uint8_t super_block_opt_flags8_t;
#define SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 (0x1U)
#define SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT (0x2U)
#define SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN (0x4U)

/**
 * typedef super_block_required_flags16_t - Required FS flags, can be ORed
 *                                          together
 *
 * Every flag set in this field must be supported by the implementation that
 * mounts the file system; if any unrecognized flag bits are set the file
 * system must not be mounted.
 * Versions of the storage service prior to the addition of the @required_flags
 * field will interpret non-zero flags as a high @fs_version and will refuse to
 * mount the file-system.
 *
 * %SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED
 *   Indicates that the main (i.e. flags does not contain
 *   %SUPER_BLOCK_FLAGS_ALTERNATE) file system has been repaired in a manner
 *   that effectively resulted in rollback to a previous state since it was last
 *   cleared. This flag is required to be supported, if set, so that we do not
 *   discard a repaired state by running an older version of the storage
 *   service. This flag is cleared when the main file system is cleared, and
 *   therefore only tracks repairs since the file system was last cleared.
 *   update_super_block_internal() sets it when either the transaction or the
 *   file system state is marked as repaired.
 * %SUPER_BLOCK_REQUIRED_FLAGS_MASK
 *   Mask of bits that are understood by the current storage implementation. If
 *   any bits of this field are set outside of this mask, do not mount the file
 *   system.
 */
typedef uint16_t super_block_required_flags16_t;
#define SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED (0x1U)
#define SUPER_BLOCK_REQUIRED_FLAGS_MASK \
    (SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED)

/**
 * struct super_block - On-disk root block for file system state
 * @iv:             Initial value used for encrypt/decrypt.
 * @magic:          SUPER_BLOCK_MAGIC.
 * @flags:          Version in bottom two bits, %SUPER_BLOCK_FLAGS_EMPTY and
 *                  %SUPER_BLOCK_FLAGS_ALTERNATE above them. All other bits are
 *                  reserved.
 * @fs_version:     Required file system version. If greater than
 *                  %SUPER_BLOCK_FS_VERSION, do not mount or overwrite
 *                  filesystem.
 * @required_flags: Required file system flags. To mount this file system, any
 *                  non-zero flag bits set must be supported by the storage
 *                  implementation.
 * @block_size:     Block size of file system.
 * @block_num_size: Number of bytes used to store block numbers.
 * @mac_size:       Number of bytes used to store mac values.
 * @opt_flags:      Optional flags, any of &typedef super_block_opt_flags8_t
 *                  ORed together.
 * @res2:           Reserved for future use. Write 0, read ignore.
 * @block_count:    Size of file system.
 * @free:           Block and mac of free set root node.
 * @free_count:     Currently unused; always written as 0.
 * @files:          Block and mac of files tree root node.
 * @res3:           Reserved for future use. Write 0, read ignore.
 * @flags2:         Copy of @flags. Allows storing the super-block in a device
 *                  that does not support an atomic write of the entire
 *                  super-block.
 * @backup:         Backup of previous super-block, used to support an alternate
 *                  backing store. 0 if no backup has ever been written. Once a
 *                  backup exists, it will only ever be swapped, not cleared.
 * @checkpoint:     Block and mac of checkpoint metadata block. 0 if a
 *                  checkpoint does not exist.
 * @res4:           Reserved for future use. Write 0, read ignore.
 * @flags3:         Copy of @flags. Allows storing the super-block in a device
 *                  that does not support an atomic write of the entire
 *                  super-block. If SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 is not set,
 *                  @flags3 is not checked and fields after @flags2 are ignored.
 *
 * Block numbers and macs in @free and @files are packed as indicated by
 * @block_num_size and @mac_size, but unlike other on-disk data, the size of the
 * whole field is always the full 24 bytes needed for an 8 byte block number and
 * a 16 byte mac. This allows @flags2 and @flags3 to be validated before
 * knowing @block_num_size and @mac_size.
 *
 * The layout is fixed: the static assertions below pin @flags2 to offset 124,
 * @flags3 to offset 252 and the total size to 256 bytes.
 */
struct super_block {
    struct iv iv;
    uint64_t magic;
    uint32_t flags;
    uint16_t fs_version;
    super_block_required_flags16_t required_flags;
    uint32_t block_size;
    uint8_t block_num_size;
    uint8_t mac_size;
    super_block_opt_flags8_t opt_flags;
    uint8_t res2;
    data_block_t block_count;
    struct block_mac free;
    data_block_t free_count;
    struct block_mac files;
    uint32_t res3[5];
    uint32_t flags2;
    struct super_block_backup backup;
    struct block_mac checkpoint;
    uint32_t res4[6];
    uint32_t flags3;
};
STATIC_ASSERT(offsetof(struct super_block, flags2) == 124);
STATIC_ASSERT(offsetof(struct super_block, flags3) == 252);
STATIC_ASSERT(sizeof(struct super_block) == 256);

/*
 * We rely on these offsets in future_fs_version_test and
 * unknown_required_flags_test in the storage_block_test to test that we will
 * not mount or modify a super block with an unknown version or unknown
 * required flags.
 */
STATIC_ASSERT(offsetof(struct super_block, fs_version) == 28);
STATIC_ASSERT(offsetof(struct super_block, required_flags) == 30);

/*
 * block_device_tipc.c ensures that we have at least 256 bytes in RPMB blocks,
 * so the super block must never grow beyond that.
 */
STATIC_ASSERT(sizeof(struct super_block) <= 256);

/*
 * File-scope list head, initially empty.
 * NOTE(review): presumably links every initialized struct fs; the code that
 * adds to and walks this list is outside this part of the file - confirm.
 */
static struct list_node fs_list = LIST_INITIAL_VALUE(fs_list);

/**
 * update_super_block_internal - Generate and write superblock
 * @tr:         Transaction object.
 * @free:       New free root.
 * @files:      New files root.
 * @checkpoint: New checkpoint metadata block.
 * @pinned:     New block should not be reused in the block cache until
 *              it is successfully written.
 *
 * Return: %true if super block was updated (in cache), %false if transaction
 * failed before super block was updated.
 */
static bool update_super_block_internal(struct transaction* tr,
                                        const struct block_mac* free,
                                        const struct block_mac* files,
                                        const struct block_mac* checkpoint,
                                        bool pinned) {
    struct super_block* super_rw;
    struct obj_ref super_ref = OBJ_REF_INITIAL_VALUE(super_ref);
    unsigned int ver;
    unsigned int index;
    super_block_required_flags16_t required_flags = 0;
    uint32_t flags;
    uint32_t block_size = tr->fs->super_dev->block_size;
    super_block_opt_flags8_t opt_flags = SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 |
                                         SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT;

    if (!tr->fs->writable) {
        pr_err("Attempting to write superblock for read-only filesystem\n");
        if (!tr->failed) {
            transaction_fail(tr);
        }
        return false;
    }

    assert(block_size >= sizeof(struct super_block));
    assert(tr->fs->initial_super_block_tr == NULL ||
           tr->fs->initial_super_block_tr == tr);

    ver = (tr->fs->super_block_version + 1) & SUPER_BLOCK_FLAGS_VERSION_MASK;
    index = ver & SUPER_BLOCK_FLAGS_BLOCK_INDEX_MASK;
    flags = ver;
    if (!free && !files) {
        /*
         * If the free and files trees are not provided, the filesystem is in
         * the initial empty state.
         */
        flags |= SUPER_BLOCK_FLAGS_EMPTY;
    } else {
        /* Non-empty filesystems must have both trees (with root node blocks) */
        assert(free);
        assert(files);
    }
    if (tr->fs->alternate_data) {
        flags |= SUPER_BLOCK_FLAGS_ALTERNATE;
    }
    if (tr->repaired || tr->fs->main_repaired) {
        /*
         * We don't track repairs in alternate data mode, so we shouldn't do
         * them - ensure the transaction does not include a repair if we are in
         * alternate state. The FS flag is used to persist the state for the
         * main FS.
         */
        assert(!tr->repaired || !tr->fs->alternate_data);
        required_flags |= SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED;
        /*
         * TODO: We would like to track the number of repairs in addition to the
         * current repair state. This may be up to three different counters: 1)
         * the number of times this fs has been repaired over the device
         * lifetime to report in metrics, 2) the number of repairs since last
         * clear, and 3) the overall fs generation count (number of device
         * lifetime repairs+clears). 2) and 3) would primarily be useful if we
         * expose them to clients via a new query API, while 1) would mostly be
         * for device metrics. We can implement some or all of these counters
         * when we add an API that consumes them.
         */
    }
    if (tr->fs->needs_full_scan) {
        opt_flags |= SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN;
    }

    pr_write("write super block %" PRIu64 ", ver %d\n",
             tr->fs->super_block[index], ver);

    super_rw = block_get_cleared_super(tr, tr->fs->super_block[index],
                                       &super_ref, pinned);
    if (tr->failed) {
        block_put_dirty_discard(super_rw, &super_ref);
        return false;
    }
    super_rw->magic = SUPER_BLOCK_MAGIC;
    super_rw->flags = flags;
    /* TODO: keep existing fs version when possible */
    super_rw->fs_version = SUPER_BLOCK_FS_VERSION;
    super_rw->required_flags = required_flags;
    super_rw->block_size = tr->fs->dev->block_size;
    super_rw->block_num_size = tr->fs->block_num_size;
    super_rw->mac_size = tr->fs->mac_size;
    super_rw->opt_flags = opt_flags;
    super_rw->block_count = tr->fs->dev->block_count;
    if (free) {
        super_rw->free = *free;
    }
    super_rw->free_count = 0; /* TODO: remove or update */
    if (files) {
        super_rw->files = *files;
    }
    if (checkpoint) {
        super_rw->checkpoint = *checkpoint;
    }
    super_rw->flags2 = flags;
    super_rw->backup = tr->fs->backup;
    super_rw->flags3 = flags;
    tr->fs->written_super_block_version = ver;

    block_put_dirty_no_mac(super_rw, &super_ref, tr->fs->allow_tampering);

    return true;
}

/**
 * update_super_block - Generate and write superblock
 * @tr:         Transaction object.
 * @free:       New free root.
 * @files:      New files root.
 * @checkpoint: New checkpoint metadata block.
 *
 * Public entry point; forwards to update_super_block_internal() without
 * requesting that the superblock be pinned.
 *
 * Return: %true if super block was updated (in cache), %false if transaction
 * failed before super block was updated.
 */
bool update_super_block(struct transaction* tr,
                        const struct block_mac* free,
                        const struct block_mac* files,
                        const struct block_mac* checkpoint) {
    const bool pinned = false;

    return update_super_block_internal(tr, free, files, checkpoint, pinned);
}

/**
 * write_initial_super_block - Write initial superblock to internal transaction
 * @fs:         File system state object.
 *
 * Allocates the special transaction, records it in @fs->initial_super_block_tr
 * and uses it to emit a pinned, empty superblock (no free root, no files root,
 * no checkpoint).
 *
 * When needed, this must be called before creating any other transactions on
 * this filesystem so we don't fill up the cache with entries that can't be
 * flushed to make room for this block.
 *
 * Return: %true if the initial empty superblock was successfully written to the
 * cache, or %false otherwise (including when the transaction could not be
 * allocated).
 */
static bool write_initial_super_block(struct fs* fs) {
    struct transaction* initial_tr = calloc(1, sizeof(*initial_tr));

    if (initial_tr == NULL) {
        return false;
    }

    /* Record the special transaction on @fs, then initialize it. */
    fs->initial_super_block_tr = initial_tr;
    transaction_init(initial_tr, fs, true);

    return update_super_block_internal(initial_tr, NULL, NULL, NULL, true);
}

/**
 * write_current_super_block - Write current superblock to internal transaction
 * @fs:           File system state object.
 * @reinitialize: Allow the special transaction to be reinitialized if it has
 *                failed
 *
 * Write the current state of the super block to an internal transaction that
 * will be written before any other block. This can be used to re-sync the
 * in-memory fs-state with the on-disk state after detecting a write failure
 * where no longer know the on-disk super block state.
 *
 * @fs must be writable when calling this function.
 */
void write_current_super_block(struct fs* fs, bool reinitialize) {
    bool super_block_updated;
    struct transaction* tr;

    assert(fs->writable);

    if (fs->initial_super_block_tr) {
        /*
         * If initial_super_block_tr is already pending and not failed there is
         * no need to allocate a new one so return early.
         *
         * If the special transaction has failed, we need to re-initialize it so
         * that we can attempt to recover to a good state.
         *
         * We are only allowed to reinitialze if the @reinitialize parameter is
         * true. We don't want to allow reinitialization while cleaning blocks
         * (i.e. via fs_unknown_super_block_state_all()), as this would reset
         * the special transaction to non-failed state and create a situation
         * where transaction_initial_super_block_complete() cannot know if it
         * successfully flushed the special transaction to disk. Therefore we
         * only allow transaction_initial_super_block_complete() to reinitialize
         * a failed special transaction after it attempts and fails to write the
         * block to disk.
         *
         * Since we pin special superblock entries in the block cache and
         * therefore cannot evict them with normal transactions,
         * transaction_initial_super_block_complete() is the only place we can
         * attempt a special transaction write, and if it fails the transaction
         * is immediately reinitialized. Therefore we should only ever be in a
         * failed state if reinitialize is true (i.e. we are being called from
         * transaction_initial_super_block_complete()).
         */

        assert(reinitialize || !fs->initial_super_block_tr->failed);
        if (!fs->initial_super_block_tr->failed || !reinitialize) {
            return;
        }

        tr = fs->initial_super_block_tr;
        transaction_activate(tr);
    } else {
        tr = calloc(1, sizeof(*tr));
        if (!tr) {
            /* Not safe to proceed. TODO: add flag to defer this allocation? */
            abort();
        }
        transaction_init(tr, fs, true);
        fs->initial_super_block_tr = tr;
    }

    /*
     * Until the filesystem contains committed data, fs->free.block_tree.root
     * will be zero, i.e. an invalid block mac. fs->free.block_tree.root is only
     * updated in transaction_complete() after successfully writing a new
     * superblock. If the filesystem is empty, we need to emit a cleared
     * superblock with a special flag to prevent the superblock state from
     * getting out of sync with the filesystem data if a reboot occurrs before
     * committing a superblock with data.
     *
     * We can't use fs->files.root here because it may be invalid if there are
     * no files in the filesystem. If the free node is zero, then the files node
     * must be as well, so we assert this.
     */
    bool fs_is_cleared = !block_mac_valid(tr, &fs->free.block_tree.root);
    if (fs_is_cleared) {
        assert(!block_mac_valid(tr, &fs->files.root));
        super_block_updated =
                update_super_block_internal(tr, NULL, NULL, NULL, true);
    } else {
        super_block_updated = update_super_block_internal(
                tr, &fs->free.block_tree.root, &fs->files.root, &fs->checkpoint,
                true);
    }
    if (!super_block_updated) {
        /* Not safe to proceed. TODO: add flag to try again? */
        fprintf(stderr,
                "Could not create pending write for current superblock state. "
                "Not safe to proceed.\n");
        abort();
    }
}

/**
 * fs_mark_scan_required - Require a full scan for invalid blocks the next time
 *                         this FS is mounted
 * @fs:             File system object
 *
 * Sets @fs->needs_full_scan so that the next superblock written carries the
 * scan-required flag. If @fs is writable, a new copy of the current super
 * block is written immediately, so the flag will persist even with no further
 * writes to the file system. If @fs is not writable, only the in-memory state
 * is updated.
 */
void fs_mark_scan_required(struct fs* fs) {
    fs->needs_full_scan = true;

    /* We can't write back the superblock until this FS is writable. */
    if (fs->writable) {
        write_current_super_block(fs, false);
        assert(fs->initial_super_block_tr);
        transaction_initial_super_block_complete(fs->initial_super_block_tr);
    }
}

/**
 * super_block_valid - Check if superblock is valid
 * @dev:        Block device that super block was read from.
 * @super:      Super block data.
 *
 * Checks, in order: the magic value, that the redundant copies of the flags
 * agree (@flags3 only when %SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 is set), the file
 * system version, that no unsupported flag bits are set, and that the block
 * size, block number size and mac size match @dev.
 *
 * A super block whose @fs_version is newer than %SUPER_BLOCK_FS_VERSION is
 * reported as valid without inspecting the remaining fields.
 * NOTE(review): this looks intentional - presumably so that the caller can
 * select such a block and then refuse to mount or overwrite it, instead of
 * treating it as garbage. The handling is outside this function; confirm.
 *
 * Return: %true if @super is valid for @dev, %false otherwise.
 */
static bool super_block_valid(const struct block_device* dev,
                              const struct super_block* super) {
    if (super->magic != SUPER_BLOCK_MAGIC) {
        pr_init("bad magic, 0x%" PRIx64 "\n", super->magic);
        return false;
    }
    if (super->flags != super->flags2) {
        pr_warn("flags, 0x%x, does not match flags2, 0x%x\n", super->flags,
                super->flags2);
        return false;
    }
    if ((super->opt_flags & SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3) &&
        super->flags != super->flags3) {
        pr_warn("flags, 0x%x, does not match flags3, 0x%x\n", super->flags,
                super->flags3);
        return false;
    }
    if (super->fs_version > SUPER_BLOCK_FS_VERSION) {
        /* Fields of a newer format cannot be validated; see NOTE above. */
        pr_warn("super block is from the future: 0x%x\n", super->fs_version);
        return true;
    }
    if (super->flags & ~SUPER_BLOCK_FLAGS_SUPPORTED_MASK) {
        pr_warn("unknown flags set, 0x%x\n", super->flags);
        return false;
    }
    if (super->block_size != dev->block_size) {
        pr_warn("bad block size 0x%x, expected 0x%zx\n", super->block_size,
                dev->block_size);
        return false;
    }
    if (super->block_num_size != dev->block_num_size) {
        pr_warn("invalid block_num_size %d, expected %zd\n",
                super->block_num_size, dev->block_num_size);
        return false;
    }
    if (super->mac_size != dev->mac_size) {
        pr_warn("invalid mac_size %d, expected %zd\n", super->mac_size,
                dev->mac_size);
        return false;
    }
    if (!dev->tamper_detecting && super->mac_size != sizeof(struct mac)) {
        /*
         * Report the size that was actually compared against; sizeof yields a
         * size_t, hence %zu.
         */
        pr_warn("invalid mac_size %d != %zu\n", super->mac_size,
                sizeof(struct mac));
        return false;
    }

    return true;
}

/**
 * super_version_delta - Find the version delta between two superblocks
 * @new_super: Candidate new superblock
 * @old_super: Old superblock
 *
 * Subtracts the two flags words and keeps only the version bits. The
 * subtraction is allowed to wrap on purpose, which is what turns the two
 * version bits into a wrapping two-bit counter.
 *
 * Return: Wrapped difference between the two bit version numbers in the two
 * superblocks. This will be 1 when new is newer than old, 3 when old is
 * newer than new, and any other number indicates an invalid/corrupt version.
 */
__attribute__((no_sanitize("unsigned-integer-overflow"))) static inline uint8_t
super_version_delta(const struct super_block* new_super,
                    const struct super_block* old_super) {
    const uint32_t wrapped_diff = new_super->flags - old_super->flags;

    return wrapped_diff & SUPER_BLOCK_FLAGS_VERSION_MASK;
}

/**
 * use_new_super - Check if new superblock is valid and more recent than old
 * @dev:                Block device that super block was read from.
 * @new_super:          New super block data.
 * @new_super_index:    Index that @new_super was read from.
 * @old_super:          Old super block data, or %NULL.
 *
 * @new_super is rejected if it fails super_block_valid() or if the block index
 * bit in its flags does not match @new_super_index. With no @old_super to
 * compare against it is then accepted; otherwise the wrapped version delta
 * decides, and an unexpected delta is logged and rejected.
 *
 * Return: %true if @new_super is valid for @dev, and more recent than
 * @old_super (or @old_super is %NULL), %false otherwise.
 */
static bool use_new_super(const struct block_device* dev,
                          const struct super_block* new_super,
                          unsigned int new_super_index,
                          const struct super_block* old_super) {
    uint8_t delta;

    if (!super_block_valid(dev, new_super)) {
        return false;
    }

    if ((new_super->flags & SUPER_BLOCK_FLAGS_BLOCK_INDEX_MASK) !=
        new_super_index) {
        pr_warn("block index, 0x%x, does not match flags, 0x%x\n",
                new_super_index, new_super->flags);
        return false;
    }

    if (old_super == NULL) {
        return true;
    }

    delta = super_version_delta(new_super, old_super);
    pr_read("version delta, %d (new flags 0x%x, old flags 0x%x)\n", delta,
            new_super->flags, old_super->flags);

    switch (delta) {
    case 1:
        /* @new_super is exactly one version ahead of @old_super. */
        return true;
    case 3:
        /* @old_super is the more recent one. */
        return false;
    default:
        pr_warn("bad version delta, %d (new flags 0x%x, old flags 0x%x)\n",
                delta, new_super->flags, old_super->flags);
        return false;
    }
}

/* Forward declaration; fs_init_free_set() is defined later in this file. */
static void fs_init_free_set(struct fs* fs, struct block_set* set);

/**
 * fs_set_roots - Initialize fs state from super block roots
 * @fs:                File system state object
 * @free:              Free set root node
 * @files:             Files tree root node
 * @checkpoint:        Checkpoint metadata block. May be NULL.
 * @restore_checkpoint: If %true, restore files and free roots from @checkpoint
 *                      (which must not be NULL).
 *
 * Unconditionally sets the filesystem roots to @free and @files respectively,
 * then attempts to restore the checkpoint roots if @restore_checkpoint is
 * %true. When attempting to restore from a checkpoint that exists but is not
 * readable, return %false, leaving the filesystem roots initialized to @free
 * and @files. If attempting to restore from checkpoint but no checkpoint was
 * previously set, this function will clear the filesystem.
 *
 * Returns %true if fs roots were correctly initialized as requested, %false if
 * a requested checkpoint restore failed (but roots were still initialized to
 * the provided blocks).
 */
static bool fs_set_roots(struct fs* fs,
                         const struct block_mac* free,
                         const struct block_mac* files,
                         const struct block_mac* checkpoint,
                         bool restore_checkpoint) {
    bool success = true;
    struct transaction tr;
    struct block_tree checkpoint_files =
            BLOCK_TREE_INITIAL_VALUE(checkpoint_files);

    assert(!restore_checkpoint || checkpoint);

    fs->free.block_tree.root = *free;
    fs->files.root = *files;

    if (checkpoint) {
        fs->checkpoint = *checkpoint;
        transaction_init(&tr, fs, true);

        /*
         * fs->checkpoint_free is initialized to contain all blocks, so we
         * don't have to initialize it if there is no checkpoint on disk
         */
        assert(!block_range_empty(fs->checkpoint_free.initial_range));

        if (block_mac_valid(&tr, &fs->checkpoint)) {
            success = checkpoint_read(&tr, &fs->checkpoint, &checkpoint_files,
                                      &fs->checkpoint_free);
        } else if (restore_checkpoint) {
            /* We do not want to restore a non-existent checkpoint */
            success = false;
        }
        if (success && restore_checkpoint) {
            /*
             * Checkpoint restore counts as a repair which must set the repaired
             * flag. We disallow checkpoint restore in alternate mode in
             * fs_init().
             */
            fs->main_repaired = true;
            fs->files.root = checkpoint_files.root;
            block_set_copy_ro(&tr, &fs->free, &fs->checkpoint_free);
            /*
             * block_set_copy_ro() clears the copy_on_write flag for the free
             * set, so we have to reset it to allow modification.
             */
            fs->free.block_tree.copy_on_write = true;
        }
        if (!tr.failed) {
            /* temporary transaction is only for reading, drop it */
            transaction_fail(&tr);
        }
        transaction_free(&tr);
    }

    return success;
}

/**
 * fs_init_free_set - Initialize an initial free set for a file system
 * @fs:         File system state object.
 * @set:        Block set to initialize
 *
 * Initializes @set to the entire range of @fs, i.e. all blocks are free.
 */
static void fs_init_free_set(struct fs* fs, struct block_set* set) {
    struct block_range range = {
            .start = fs->min_block_num,
            .end = fs->dev->block_count,
    };
    block_set_add_initial_range(set, range);
}

/**
 * fs_init_from_super - Initialize file system from super block
 * @fs:         File system state object.
 * @super:      Superblock data, or %NULL if no usable superblock was found.
 * @flags:      Any of &typedef fs_init_flags32_t, ORed together.
 *
 * Resets the in-memory state of @fs to an empty file system, then, if @super
 * is usable, loads the free set, file tree and checkpoint roots from either
 * the active or the backup slot of @super (depending on whether the alternate
 * mode recorded in @super matches %FS_INIT_FLAGS_ALTERNATE_DATA).
 *
 * Return: -1 only if writing the initial super block for a cleared file system
 * failed; 0 in every other case. The file system may not be readable or
 * writable even if this function returns 0 (unsupported version or required
 * flags, shrunken block device, failed checkpoint restore). Check
 * @fs->readable and @fs->writable before attempting to use this file system.
 */
static int fs_init_from_super(struct fs* fs,
                              const struct super_block* super,
                              fs_init_flags32_t flags) {
    bool is_clear = false;
    bool do_clear = flags & FS_INIT_FLAGS_DO_CLEAR;
    bool do_swap = false; /* True when the alternate mode of the active
                             superblock differs from the requested mode */
    bool do_clear_backup = false;
    bool has_backup_field =
            super && (super->opt_flags & SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3);
    bool has_checkpoint_field =
            has_backup_field && super &&
            (super->opt_flags & SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT);
    bool recovery_allowed = flags & FS_INIT_FLAGS_RECOVERY_CLEAR_ALLOWED;
    bool read_only = false;
    /*
     * The two root pointers below are only assigned on the paths that leave
     * is_clear false, and are only read when is_clear is false.
     */
    const struct block_mac* new_files_root;
    const struct block_mac* new_free_root;
    const struct block_mac* new_checkpoint = NULL;

    /*
     * We check that the super-block matches these block device params in
     * super_block_valid(). If these params change, the filesystem (and
     * alternate backup) will be wiped and reset with the new params.
     */
    fs->block_num_size = fs->dev->block_num_size;
    fs->mac_size = fs->dev->mac_size;

    /* Start from an empty, copy-on-write free set and file tree. */
    block_set_init(fs, &fs->free);
    fs->free.block_tree.copy_on_write = true;
    fs_file_tree_init(fs, &fs->files);
    fs->files.copy_on_write = true;
    fs->files.allow_copy_on_write = true;
    fs->main_repaired = false;

    memset(&fs->checkpoint, 0, sizeof(fs->checkpoint));
    block_set_init(fs, &fs->checkpoint_free);
    /*
     * checkpoint_init() will clear the checkpoint initial range if a valid
     * checkpoint exists.
     */
    fs_init_free_set(fs, &fs->checkpoint_free);

    /* Reserve 1/4 for tmp blocks plus half of the remaining space */
    fs->reserved_count = fs->dev->block_count / 8 * 5;

    fs->alternate_data = flags & FS_INIT_FLAGS_ALTERNATE_DATA;

    /*
     * Check version and flags after initializing an empty FS, so that we can
     * disallow writing and continue initializing other file systems. If we exit
     * early here this file system will be inaccessible, but its fields are
     * safely initialized.
     */
    if (super && super->fs_version > SUPER_BLOCK_FS_VERSION) {
        pr_err("ERROR: super block is from the future 0x%x\n",
               super->fs_version);
        error_report_superblock_invalid(fs->name);
        assert(!fs->readable);
        assert(!fs->writable);
        return 0;
    }

    if (super && (super->required_flags & ~SUPER_BLOCK_REQUIRED_FLAGS_MASK)) {
        pr_err("ERROR: super block requires unrecognized fs features: 0x%x\n",
               super->required_flags);
        error_report_superblock_invalid(fs->name);
        assert(!fs->readable);
        assert(!fs->writable);
        return 0;
    }

    if (super) {
        fs->super_block_version = super->flags & SUPER_BLOCK_FLAGS_VERSION_MASK;
        fs->needs_full_scan =
                super->opt_flags & SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN;
        fs->main_repaired = super->required_flags &
                            SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED;

        /*
         * Swap when exactly one of "superblock is in alternate mode" and
         * "caller requested alternate data" is true.
         */
        do_swap = !(super->flags & SUPER_BLOCK_FLAGS_ALTERNATE) !=
                  !(flags & FS_INIT_FLAGS_ALTERNATE_DATA);

        if (do_swap) {
            pr_init("Swapping super-block with alternate\n");

            /* The currently active roots become the new backup. */
            fs->backup.flags = super->flags & (SUPER_BLOCK_FLAGS_EMPTY |
                                               SUPER_BLOCK_FLAGS_ALTERNATE);
            fs->backup.free = super->free;
            fs->backup.files = super->files;
            fs->backup.checkpoint = super->checkpoint;

            /* The stored backup (if any) becomes the active file system. */
            if (!has_backup_field ||
                super->backup.flags & SUPER_BLOCK_FLAGS_EMPTY) {
                is_clear = true;
            } else if (has_backup_field) {
                new_files_root = &super->backup.files;
                new_free_root = &super->backup.free;
                if (has_checkpoint_field) {
                    new_checkpoint = &super->backup.checkpoint;
                }
            }
        } else {
            if (has_backup_field) {
                fs->backup = super->backup;
            }

            if (super->flags & SUPER_BLOCK_FLAGS_EMPTY) {
                is_clear = true;
            } else {
                new_files_root = &super->files;
                new_free_root = &super->free;
                if (has_checkpoint_field) {
                    new_checkpoint = &super->checkpoint;
                }
            }
        }

        /*
         * Probe both roots before trusting them. A failed probe only turns
         * into a clear when the caller allowed recovery.
         */
        if (!is_clear && !do_clear &&
            (!block_probe(fs, new_files_root, true) ||
             !block_probe(fs, new_free_root, false))) {
            pr_init("Backing file probe failed, fs is corrupted.\n");
            if (recovery_allowed) {
                pr_init("Attempting to clear corrupted fs.\n");
                do_clear = true;
            }
        }

        /*
         * Check that the block device has not shrunk. Shrinking is only allowed
         * in limited circumstances if we are also clearing the filesystem.
         */
        if (super->block_count > fs->dev->block_count) {
            if ((!do_clear) && (!is_clear)) {
                /*
                 * If block device is smaller than super and we're not clearing
                 * the fs, we want to prevent write access to avoid losing data.
                 * Read-only access is still allowed, although blocks may be
                 * missing.
                 */
                pr_err("bad block count 0x%" PRIx64 ", expected <= 0x%" PRIx64
                       "\n",
                       super->block_count, fs->dev->block_count);
                read_only = true;
            } else if (flags & FS_INIT_FLAGS_ALTERNATE_DATA) {
                /*
                 * Either we are on main filesystem and switching to alternate
                 * or we are on alternate. Either case is an error. If we get
                 * here, then the alternate FS is not backed by a temp file,
                 * which should never happen. We want to error loudly in this
                 * case, but continue mounting other file systems.
                 */
                pr_err("Can't clear fs if FS_INIT_FLAGS_ALTERNATE_DATA is"
                       " set .\n");
                assert(!fs->readable);
                assert(!fs->writable);
                return 0;
            } else {
                /*
                 * If we are on main filesystem and the backup is an
                 * alternate, clear the backup also.
                 */
                do_clear_backup = true;
            }
        }
    }

    /* A checkpoint restore on the main file system drops the scan request. */
    if (!fs->alternate_data && (flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT)) {
        fs->needs_full_scan = false;
    }

    /*
     * If any of the following are true:
     * - we are initializing a new fs
     * - we are not swapping but detect an old superblock without the backup
     * - the block device has shrunk, the main file system is being cleared (or
     *   is already empty) and FS_INIT_FLAGS_ALTERNATE_DATA is not set
     * then ensure that the backup slot is a valid empty filesystem in case we
     * later switch filesystems without an explicit clear flag.
     */
    if (!super || (!do_swap && !has_backup_field) || do_clear_backup) {
        fs->backup = (struct super_block_backup){
                .flags = SUPER_BLOCK_FLAGS_EMPTY,
                .files = {0},
                .free = {0},
                .checkpoint = {0},
        };
    }

    if (super && !is_clear && !do_clear) {
        if (!fs_set_roots(fs, new_free_root, new_files_root, new_checkpoint,
                          flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT)) {
            /*
             * fs_set_roots() returns false if the checkpoint restore failed,
             * but leaves the roots in a valid state to allow read-only access.
             */
            pr_err("fs %s: failed to initialize filesystem roots\n", fs->name);
            read_only = true;
        } else {
            /*
             * An empty initial range in checkpoint_free is reported as
             * "checkpoint exists" (see the checkpoint_init() note above).
             */
            pr_init("fs %s: loaded super block version %d, checkpoint exists: %d\n",
                    fs->name, fs->super_block_version,
                    block_range_empty(fs->checkpoint_free.initial_range));
        }
    } else {
        if (is_clear) {
            pr_init("fs %s: superblock, version %d, is empty fs\n", fs->name,
                    fs->super_block_version);
        } else if (do_clear) {
            pr_init("fs %s: clear requested, create empty, version %d\n",
                    fs->name, fs->super_block_version);
            /* Clearing the main file system also resets its repair state. */
            if (!fs->alternate_data) {
                fs->main_repaired = false;
                fs->needs_full_scan = false;
            }
        } else {
            pr_init("fs %s: no valid super-block found, create empty\n",
                    fs->name);
        }
        /* Empty file system: every block is free. */
        fs_init_free_set(fs, &fs->free);
    }
    assert(fs->block_num_size >= fs->dev->block_num_size);
    assert(fs->block_num_size <= sizeof(data_block_t));
    assert(fs->mac_size >= fs->dev->mac_size);
    assert(fs->mac_size <= sizeof(struct mac));
    assert(fs->mac_size == sizeof(struct mac) || fs->dev->tamper_detecting);

    /*
     * fs_set_roots() unconditionally set the files and free roots. If it fails,
     * it failed to read the checkpoint block but that should only block
     * modification, not reading.
     */
    fs->readable = true;

    if (read_only) {
        assert(!fs->writable);
        return 0;
    }

    fs->writable = true;
    /*
     * A requested clear of a file system that is not already empty on disk
     * must be persisted before anything else is written.
     */
    if (do_clear && !is_clear) {
        if (!write_initial_super_block(fs)) {
            return -1;
        }
    } else if (flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT) {
        /*
         * Flush the new restored checkpoint to superblock before overwriting
         * any data blocks. We know that we can't already have a pending
         * initial_super_block_tr yet because we just made the filesystem
         * writable, and write_current_super_block() requires a writable
         * filesystem.
         */
        assert(!fs->initial_super_block_tr);
        write_current_super_block(fs, false);
    }

    return 0;
}

/**
 * load_super_block - Find and load superblock and initialize file system state
 * @fs:         File system state object.
 * @flags:      Any of &typedef fs_init_flags32_t, ORed together.
 *
 * Reads every slot listed in @fs->super_block, keeps the candidate that
 * use_new_super() prefers, and passes it (or %NULL if no candidate was
 * accepted) to fs_init_from_super().
 *
 * Return: -1 if a superblock slot could not be read and @fs->allow_tampering
 * is not set, otherwise the return value of fs_init_from_super().
 */
static int load_super_block(struct fs* fs, fs_init_flags32_t flags) {
    const struct super_block* best = NULL;
    struct obj_ref best_ref = OBJ_REF_INITIAL_VALUE(best_ref);
    const struct super_block* candidate;
    struct obj_ref candidate_ref = OBJ_REF_INITIAL_VALUE(candidate_ref);
    unsigned int slot;
    int ret;

    assert(fs->super_dev->block_size >= sizeof(struct super_block));

    for (slot = 0; slot < countof(fs->super_block); slot++) {
        candidate = block_get_super(fs, fs->super_block[slot], &candidate_ref);
        if (candidate == NULL) {
            if (!fs->allow_tampering) {
                pr_err("failed to read super-block\n");
                ret = -1;  // -EIO ? ERR_IO?;
                goto out;
            }
            /*
             * Superblock may not exist yet in non-secure storage, proceed
             * anyway
             */
            continue;
        }

        if (!use_new_super(fs->dev, candidate, slot, best)) {
            /* Not preferred over what we already hold; release it. */
            block_put(candidate, &candidate_ref);
            continue;
        }

        /* The candidate wins: drop the previous pick and keep this one. */
        if (best != NULL) {
            block_put(best, &best_ref);
        }
        best = candidate;
        obj_ref_transfer(&best_ref, &candidate_ref);
    }

    ret = fs_init_from_super(fs, best, flags);

out:
    if (best != NULL) {
        block_put(best, &best_ref);
    }
    return ret;
}

/**
 * struct fs_check_state - State shared by the per-file callback of a full check
 * @iter:                   Embedded iterator state; fs_check_file() recovers
 *                          the enclosing object from it with containerof().
 * @delete_invalid_files:   Not read by the code in this part of the file.
 * @internal_state_valid:   Cleared when a file's info cannot be read, a file
 *                          cannot be opened, file_check() fails, or a tree is
 *                          not fully readable.
 * @invalid_block_found:    Set when a transaction reported an invalid block
 *                          while checking the file tree or a file.
 */
struct fs_check_state {
    struct file_iterate_state iter;
    bool delete_invalid_files;

    bool internal_state_valid;
    bool invalid_block_found;
};

/**
 * fs_check_file - Per-file callback used by fs_check_full()
 * @iter:       Iterator state embedded in a &struct fs_check_state.
 * @tr:         Transaction used for reading; must be active and must not have
 *              an invalid block recorded on entry.
 * @block_mac:  Block-mac of the file's info block.
 * @added:      Unused.
 * @removed:    Unused.
 *
 * Reads the file's path, opens the file by that path and runs file_check() on
 * it. Any failure is recorded in the enclosing &struct fs_check_state. Before
 * returning, an invalid-block indication on @tr is moved into the check state
 * and a failed @tr is re-activated so the next file can be checked.
 *
 * Return: always %false, so that iteration continues.
 */
static bool fs_check_file(struct file_iterate_state* iter,
                          struct transaction* tr,
                          const struct block_mac* block_mac,
                          bool added,
                          bool removed) {
    struct fs_check_state* check =
            containerof(iter, struct fs_check_state, iter);
    struct obj_ref info_ref = OBJ_REF_INITIAL_VALUE(info_ref);
    struct storage_file_handle file;
    char path[FS_PATH_MAX];
    const struct file_info* info;

    assert(!tr->failed);
    assert(!tr->invalid_block_found);

    info = file_get_info(tr, block_mac, &info_ref);
    if (info) {
        /* Copy the path out so the info block can be released right away. */
        strncpy(path, info->path, sizeof(path));
        path[sizeof(path) - 1] = '\0';
        file_info_put(info, &info_ref);

        if (file_open(tr, path, &file, FILE_OPEN_NO_CREATE, true) ==
            FILE_OP_SUCCESS) {
            if (!file_check(tr, &file)) {
                check->internal_state_valid = false;
            }
            file_close(&file);
        } else {
            /* TODO: is it ok to leak the filename here? we do it elsewhere */
            pr_err("could not open file %s\n", path);
            check->internal_state_valid = false;
        }
    } else {
        pr_err("could not get file info at block %" PRIu64 "\n",
               block_mac_to_block(tr, block_mac));
        check->internal_state_valid = false;
    }

    if (tr->invalid_block_found) {
        /* Remember the invalid block, then reset for the next file. */
        check->invalid_block_found = true;
        tr->invalid_block_found = false;
    }
    if (tr->failed) {
        transaction_activate(tr);
    }

    /* Continue iterating unconditionally */
    return false;
}

enum fs_check_result fs_check_full(struct fs* fs) {
    bool free_set_valid, file_tree_valid;
    enum fs_check_result res = FS_CHECK_NO_ERROR;
    struct transaction iterate_tr;
    struct fs_check_state state = {
            .iter.file = fs_check_file,
            .internal_state_valid = true,
            .invalid_block_found = false,
    };

    transaction_init(&iterate_tr, fs, true);

    /* Check the free list for consistency */
    free_set_valid = block_set_check(&iterate_tr, &fs->free);
    if (!free_set_valid || iterate_tr.invalid_block_found) {
        pr_err("free block set is invalid\n");
        res = FS_CHECK_INVALID_FREE_SET;
        /*
         * We can recover the free set non-destructively by rebuilding from the
         * file tree, so we don't need to report the invalid block.
         */
        iterate_tr.invalid_block_found = false;
    }
    if (iterate_tr.failed) {
        pr_err("free set tree not fully readable\n");
        state.internal_state_valid = false;
        transaction_activate(&iterate_tr);
    }

    /* Check the file tree for consistency */
    file_tree_valid = block_tree_check(&iterate_tr, &fs->files);
    if (!file_tree_valid) {
        pr_err("file tree is invalid\n");
        res = FS_CHECK_INVALID_FILE_TREE;
    }
    if (iterate_tr.invalid_block_found) {
        pr_err("invalid block encountered in file tree\n");
        state.invalid_block_found = true;
        iterate_tr.invalid_block_found = false;
    }
    if (iterate_tr.failed) {
        pr_err("file tree not fully readable\n");
        state.internal_state_valid = false;
        transaction_activate(&iterate_tr);
    }

    file_iterate(&iterate_tr, NULL, false, &state.iter, true);

    /* Invalid blocks take precedence over internal consistency errors. */
    if (state.invalid_block_found) {
        res = FS_CHECK_INVALID_BLOCK;
    } else if (res == FS_CHECK_NO_ERROR && !state.internal_state_valid) {
        res = FS_CHECK_UNKNOWN;
    }
    if (!iterate_tr.failed) {
        transaction_fail(&iterate_tr);
    }
    transaction_free(&iterate_tr);

    return res;
}

/**
 * fs_check_quick - Probe only the root blocks of @fs
 * @fs:         File system state object.
 *
 * A file system whose free set still has a non-empty initial range is treated
 * as clear and passes without probing. Otherwise the file tree root is probed
 * first, and the free set root only if that probe succeeded.
 *
 * Return: %FS_CHECK_NO_ERROR if the file system is clear or both probes
 * succeed, %FS_CHECK_INVALID_BLOCK otherwise.
 */
enum fs_check_result fs_check_quick(struct fs* fs) {
    if (!block_range_empty(fs->free.initial_range)) {
        /* Clear file system: there are no roots to probe. */
        return FS_CHECK_NO_ERROR;
    }
    if (!block_probe(fs, &fs->files.root, true)) {
        return FS_CHECK_INVALID_BLOCK;
    }
    if (!block_probe(fs, &fs->free.block_tree.root, false)) {
        return FS_CHECK_INVALID_BLOCK;
    }
    return FS_CHECK_NO_ERROR;
}

/**
 * fs_check - Check @fs, choosing the depth from @fs->needs_full_scan
 * @fs:         File system state object.
 *
 * Return: result of fs_check_full() if @fs->needs_full_scan is set, result of
 * fs_check_quick() otherwise.
 */
enum fs_check_result fs_check(struct fs* fs) {
    if (!fs->needs_full_scan) {
        return fs_check_quick(fs);
    }

    pr_warn("%s filesystem requires full scan on mount\n", fs->name);
    return fs_check_full(fs);
}

/**
 * fs_file_tree_init - Initialize an empty file tree for a file system
 * @fs:        File system state object.
 * @tree:      Block tree to initialize as a file tree.
 *
 * Calls block_tree_init() with the block size of @fs->dev, the block number
 * size of @fs, and (twice) the combined size of a block number plus a mac.
 */
void fs_file_tree_init(const struct fs* fs, struct block_tree* tree) {
    const size_t num_size = fs->block_num_size;
    const size_t num_and_mac_size = num_size + fs->mac_size;

    block_tree_init(tree, fs->dev->block_size, num_size, num_and_mac_size,
                    num_and_mac_size);
}

/**
 * fs_init - Initialize file system state
 * @fs:         File system state object.
 * @name:       File system name for error reporting. Must be a static string.
 * @key:        Key pointer. Must not be freed while @fs is in use.
 * @dev:        Main block device.
 * @super_dev:  Block device for super block.
 * @flags:      Any of &typedef fs_init_flags32_t, ORed together.
 *
 * Validates @super_dev and @flags, resets the bookkeeping fields of @fs, adds
 * @fs to the global file system list and loads its superblock. If
 * %FS_INIT_FLAGS_AUTO_CHECKPOINT is set and no valid checkpoint was loaded, a
 * full check is run and, if it finds no error, @fs->checkpoint_required is
 * set.
 *
 * Return: 0 on success. -1 if @super_dev is too small, if both
 * %FS_INIT_FLAGS_ALTERNATE_DATA and %FS_INIT_FLAGS_RESTORE_CHECKPOINT are set,
 * or the non-zero result of load_super_block(). On a load failure @fs is
 * destroyed again and its device pointers are cleared.
 */
int fs_init(struct fs* fs,
            const char* name,
            const struct key* key,
            struct block_device* dev,
            struct block_device* super_dev,
            fs_init_flags32_t flags) {
    int ret;

    if (super_dev->block_size < sizeof(struct super_block)) {
        /* Both values are unsigned sizes, so %zu is the matching conversion. */
        pr_err("unsupported block size for super_dev, %zu < %zu\n",
               super_dev->block_size, sizeof(struct super_block));
        return -1;  // ERR_NOT_VALID?
    }

    /* Two superblock slots (blocks 0 and 1) are used below. */
    if (super_dev->block_count < 2) {
        pr_err("unsupported block count for super_dev, %" PRIu64 "\n",
               super_dev->block_count);
        return -1;  // ERR_NOT_VALID?
    }

    if ((flags & FS_INIT_FLAGS_ALTERNATE_DATA) &&
        (flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT)) {
        pr_err("Alternate file system cannot restore to a checkpoint\n");
        return -1;
    }

    fs->name = name;
    fs->key = key;
    fs->dev = dev;
    fs->super_dev = super_dev;
    fs->readable = false;
    fs->writable = false;
    fs->allow_tampering = flags & FS_INIT_FLAGS_ALLOW_TAMPERING;
    fs->checkpoint_required = false;
    /*
     * Give the scan flag a defined value. It is only loaded from the
     * superblock when one is found, but fs_check() reads it regardless.
     */
    fs->needs_full_scan = false;
    list_initialize(&fs->transactions);
    list_initialize(&fs->allocated);
    fs->initial_super_block_tr = NULL;
    list_add_tail(&fs_list, &fs->node);

    /*
     * When data and superblocks share a device, the first two blocks hold the
     * superblocks and must not be handed out as data blocks.
     */
    if (dev == super_dev) {
        fs->min_block_num = 2;
    } else {
        /* TODO: use 0 when btree code allows it */
        fs->min_block_num = 1;
    }
    fs->super_block[0] = 0;
    fs->super_block[1] = 1;
    ret = load_super_block(fs, flags);
    if (ret) {
        fs_destroy(fs);
        fs->dev = NULL;
        fs->super_dev = NULL;
        return ret;
    }

    if ((flags & FS_INIT_FLAGS_AUTO_CHECKPOINT) &&
        !block_mac_valid_fs(fs, &fs->checkpoint)) {
        /* Only request a checkpoint of a file system that checks out clean. */
        if (fs_check_full(fs) == FS_CHECK_NO_ERROR) {
            fs->checkpoint_required = true;
        } else {
            pr_err("Not automatically creating a checkpoint; "
                   "an error was found in filesystem %s\n",
                   fs->name);
        }
    }

    return 0;
}

/**
 * fs_destroy - Destroy file system state
 * @fs:         File system state object.
 *
 * Free any dynamically allocated state and check that @fs is not referenced by
 * any transactions.
 */
void fs_destroy(struct fs* fs) {
    if (fs->initial_super_block_tr) {
        if (!fs->initial_super_block_tr->failed) {
            transaction_fail(fs->initial_super_block_tr);
        }
        transaction_free(fs->initial_super_block_tr);
        free(fs->initial_super_block_tr);
        fs->initial_super_block_tr = NULL;
    }
    assert(list_is_empty(&fs->transactions));
    assert(list_is_empty(&fs->allocated));
    list_delete(&fs->node);
    fs->readable = false;
    fs->writable = false;
}

/**
 * fs_unknown_super_block_state_all - Notify filesystems of unknown disk state
 *
 * Call from other layers when detecting write failures that can cause the
 * in-memory state of super blocks (or other blocks that we don't care about)
 * to be different from the on-disk state. Write in-memory state to disk before
 * writing any other block.
 */
void fs_unknown_super_block_state_all(void) {
    struct fs* cur;

    list_for_every_entry(&fs_list, cur, struct fs, node) {
        /* TODO: filter out filesystems that are not affected? */

        /*
         * A file system that is not writable is skipped: it cannot hold any
         * modifications that we are allowed to save, and it does not need to
         * be re-synced here as we cannot have previously failed to write its
         * superblock.
         */
        if (!cur->writable) {
            continue;
        }

        /*
         * Do not reinitialize an existing, failed special transaction. If an
         * initial superblock write failed and triggered this function, that
         * superblock transaction has to stay in a failed state so that the
         * transaction that triggered the failing write can also be failed
         * further up the call chain. If a special transaction already exists
         * we are guaranteed that it will be reinitialized and flushed to disk
         * before any new writes to that FS, so there is nothing to redo here.
         */
        write_current_super_block(cur, false /* reinitialize */);
    }
}

/**
 * fs_fail_all_transactions - Fail every active transaction of every filesystem
 *
 * Walks all file systems on the global list and calls transaction_fail() on
 * each of their transactions that is active and has not already failed.
 */
void fs_fail_all_transactions(void) {
    struct fs* cur_fs;
    struct transaction* cur_tr;
    struct transaction* next_tr;

    list_for_every_entry(&fs_list, cur_fs, struct fs, node) {
        list_for_every_entry_safe(&cur_fs->transactions, cur_tr, next_tr,
                                  struct transaction, node) {
            if (!transaction_is_active(cur_tr) || cur_tr->failed) {
                continue;
            }
            transaction_fail(cur_tr);
        }
    }
}