1 //===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This defines posting list interface: a storage for identifiers of symbols 11 /// which can be characterized by a specific feature (such as fuzzy-find 12 /// trigram, scope, type or any other Search Token). Posting lists can be 13 /// traversed in order using an iterator and are values for inverted index, 14 /// which maps search tokens to corresponding posting lists. 15 /// 16 /// In order to decrease size of Index in-memory representation, Variable Byte 17 /// Encoding (VByte) is used for PostingLists compression. An overview of VByte 18 /// algorithm can be found in "Introduction to Information Retrieval" book: 19 /// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html 20 /// 21 //===----------------------------------------------------------------------===// 22 23 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H 24 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H 25 26 #include "Iterator.h" 27 #include "llvm/ADT/ArrayRef.h" 28 #include "llvm/ADT/SmallVector.h" 29 #include <cstdint> 30 #include <vector> 31 32 namespace clang { 33 namespace clangd { 34 namespace dex { 35 class Token; 36 37 /// NOTE: This is an implementation detail. 38 /// 39 /// Chunk is a fixed-width piece of PostingList which contains the first DocID 40 /// in uncompressed format (Head) and delta-encoded Payload. It can be 41 /// decompressed upon request. 42 struct Chunk { 43 /// Keep sizeof(Chunk) == 32. 44 static constexpr size_t PayloadSize = 32 - sizeof(DocID); 45 46 llvm::SmallVector<DocID, PayloadSize + 1> decompress() const; 47 48 /// The first element of decompressed Chunk. 49 DocID Head; 50 /// VByte-encoded deltas. 51 std::array<uint8_t, PayloadSize> Payload; 52 }; 53 static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory."); 54 55 /// PostingList is the storage of DocIDs which can be inserted to the Query 56 /// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs 57 /// are stored in underlying chunks. Compression saves memory at a small cost 58 /// in access time, which is still fast enough in practice. 59 class PostingList { 60 public: 61 explicit PostingList(llvm::ArrayRef<DocID> Documents); 62 63 /// Constructs DocumentIterator over given posting list. DocumentIterator will 64 /// go through the chunks and decompress them on-the-fly when necessary. 65 /// If given, Tok is only used for the string representation. 66 std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const; 67 68 /// Returns in-memory size of external storage. bytes()69 size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); } 70 71 private: 72 const std::vector<Chunk> Chunks; 73 }; 74 75 } // namespace dex 76 } // namespace clangd 77 } // namespace clang 78 79 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H 80