//===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// Symbol index queries consist of specific requirements for the requested /// symbol, such as high fuzzy matching score, scope, type etc. The lists of all /// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope) /// are expressed in a form of Search Tokens which are stored in the inverted /// index. Inverted index maps these tokens to the posting lists - sorted (by /// symbol quality) sequences of symbol IDs matching the token, e.g. scope token /// "clangd::clangd::" is mapped to the list of IDs of all symbols which are /// declared in this namespace. Search queries are build from a set of /// requirements which can be combined with each other forming the query trees. /// The leafs of such trees are posting lists, and the nodes are operations on /// these posting lists, e.g. intersection or union. Efficient processing of /// these multi-level queries is handled by Iterators. Iterators advance through /// all leaf posting lists producing the result of search query, which preserves /// the sorted order of IDs. Having the resulting IDs sorted is important, /// because it allows receiving a certain number of the most valuable items /// (e.g. symbols with highest quality which was the sorting key in the first /// place) without processing all items with requested properties (this might /// not be computationally effective if search request is not very restrictive). // //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/raw_ostream.h" #include #include #include namespace clang { namespace clangd { namespace dex { /// Symbol position in the list of all index symbols sorted by a pre-computed /// symbol quality. using DocID = uint32_t; /// Iterator is the interface for Query Tree node. The simplest type of Iterator /// is DocumentIterator which is simply a wrapper around PostingList iterator /// and serves as the Query Tree leaf. More sophisticated examples of iterators /// can manage intersection, union of the elements produced by other iterators /// (their children) to form a multi-level Query Tree. The interface is designed /// to be extensible in order to support multiple types of iterators. class Iterator { public: /// Returns true if all valid DocIDs were processed and hence the iterator is /// exhausted. virtual bool reachedEnd() const = 0; /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted /// and proceeds to the END. /// /// Note: reachedEnd() must be false. virtual void advance() = 0; /// Moves to the first valid DocID which is equal or higher than given ID. If /// it doesn't exist, the iterator is exhausted and proceeds to the END. /// /// Note: reachedEnd() must be false. virtual void advanceTo(DocID ID) = 0; /// Returns the current element this iterator points to. /// /// Note: reachedEnd() must be false. virtual DocID peek() const = 0; /// Informs the iterator that the current document was consumed, and returns /// its boost. /// /// Note: If this iterator has any child iterators that contain the document, /// consume() should be called on those and their boosts incorporated. /// consume() must *not* be called on children that don't contain the current /// doc. virtual float consume() = 0; /// Returns an estimate of advance() calls before the iterator is exhausted. virtual size_t estimateSize() const = 0; virtual ~Iterator() {} /// Prints a convenient human-readable iterator representation by recursively /// dumping iterators in the following format: /// /// (Type Child1 Child2 ...) /// /// Where Type is the iterator type representation: "&" for And, "|" for Or, /// ChildN is N-th iterator child. Raw iterators over PostingList are /// represented as "[... CurID ...]" where CurID is the current PostingList /// entry being inspected. friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Iterator &Iterator) { return Iterator.dump(OS); } /// Inspect iterator type, used internally for optimizing query trees. enum class Kind { And, Or, True, False, Other }; Kind kind() const { return MyKind; } protected: Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {} private: virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; Kind MyKind; }; /// Advances the iterator until it is exhausted. Returns pairs of document IDs /// with the corresponding boosting score. /// /// Boosting can be seen as a compromise between retrieving too many items and /// calculating finals score for each of them (which might be very expensive) /// and not retrieving enough items so that items with very high final score /// would not be processed. Boosting score is a computationally efficient way /// to acquire preliminary scores of requested items. std::vector> consume(Iterator &It); namespace detail { // Variadic template machinery. inline void populateChildren(std::vector> &) {} template void populateChildren(std::vector> &Children, std::unique_ptr Head, TailT... Tail) { Children.push_back(move(Head)); populateChildren(Children, move(Tail)...); } } // namespace detail // A corpus is a set of documents, and a factory for iterators over them. class Corpus { DocID Size; public: explicit Corpus(DocID Size) : Size(Size) {} /// Returns AND Iterator which performs the intersection of the PostingLists /// of its children. /// /// consume(): AND Iterator returns the product of Childrens' boosting /// scores. std::unique_ptr intersect(std::vector> Children) const; /// Returns OR Iterator which performs the union of the PostingLists of its /// children. /// /// consume(): OR Iterator returns the highest boost value among children /// containing the requested item. std::unique_ptr unionOf(std::vector> Children) const; /// Returns TRUE Iterator which iterates over "virtual" PostingList /// containing all items in range [0, Size) in an efficient manner. std::unique_ptr all() const; /// Returns FALSE Iterator which iterates over no documents. std::unique_ptr none() const; /// Returns BOOST iterator which multiplies the score of each item by given /// factor. Boosting can be used as a computationally inexpensive filtering. /// Users can return significantly more items using consumeAndBoost() and /// then trim Top K using retrieval score. std::unique_ptr boost(std::unique_ptr Child, float Factor) const; /// Returns LIMIT iterator, which yields up to N elements of its child /// iterator. Elements only count towards the limit if they are part of the /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2) /// 1)) yields (2), not (). std::unique_ptr limit(std::unique_ptr Child, size_t Limit) const; /// This allows intersect(create(...), create(...)) syntax. template std::unique_ptr intersect(Args... args) const { std::vector> Children; detail::populateChildren(Children, std::forward(args)...); return intersect(move(Children)); } /// This allows unionOf(create(...), create(...)) syntax. template std::unique_ptr unionOf(Args... args) const { std::vector> Children; detail::populateChildren(Children, std::forward(args)...); return unionOf(move(Children)); } }; } // namespace dex } // namespace clangd } // namespace clang #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H