1 //===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// Symbol index queries consist of specific requirements for the requested
11 /// symbol, such as high fuzzy matching score, scope, type etc. The lists of all
12 /// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope)
13 /// are expressed in a form of Search Tokens which are stored in the inverted
14 /// index. Inverted index maps these tokens to the posting lists - sorted (by
15 /// symbol quality) sequences of symbol IDs matching the token, e.g. scope token
16 /// "clangd::clangd::" is mapped to the list of IDs of all symbols which are
17 /// declared in this namespace. Search queries are build from a set of
18 /// requirements which can be combined with each other forming the query trees.
19 /// The leafs of such trees are posting lists, and the nodes are operations on
20 /// these posting lists, e.g. intersection or union. Efficient processing of
21 /// these multi-level queries is handled by Iterators. Iterators advance through
22 /// all leaf posting lists producing the result of search query, which preserves
23 /// the sorted order of IDs. Having the resulting IDs sorted is important,
24 /// because it allows receiving a certain number of the most valuable items
25 /// (e.g. symbols with highest quality which was the sorting key in the first
26 /// place) without processing all items with requested properties (this might
27 /// not be computationally effective if search request is not very restrictive).
28 //
29 //===----------------------------------------------------------------------===//
30 
31 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
32 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
33 
34 #include "llvm/ADT/ArrayRef.h"
35 #include "llvm/Support/raw_ostream.h"
36 #include <algorithm>
37 #include <memory>
38 #include <vector>
39 
40 namespace clang {
41 namespace clangd {
42 namespace dex {
43 
44 /// Symbol position in the list of all index symbols sorted by a pre-computed
45 /// symbol quality.
46 using DocID = uint32_t;
47 
48 /// Iterator is the interface for Query Tree node. The simplest type of Iterator
49 /// is DocumentIterator which is simply a wrapper around PostingList iterator
50 /// and serves as the Query Tree leaf. More sophisticated examples of iterators
51 /// can manage intersection, union of the elements produced by other iterators
52 /// (their children) to form a multi-level Query Tree. The interface is designed
53 /// to be extensible in order to support multiple types of iterators.
54 class Iterator {
55 public:
56   /// Returns true if all valid DocIDs were processed and hence the iterator is
57   /// exhausted.
58   virtual bool reachedEnd() const = 0;
59   /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted
60   /// and proceeds to the END.
61   ///
62   /// Note: reachedEnd() must be false.
63   virtual void advance() = 0;
64   /// Moves to the first valid DocID which is equal or higher than given ID. If
65   /// it doesn't exist, the iterator is exhausted and proceeds to the END.
66   ///
67   /// Note: reachedEnd() must be false.
68   virtual void advanceTo(DocID ID) = 0;
69   /// Returns the current element this iterator points to.
70   ///
71   /// Note: reachedEnd() must be false.
72   virtual DocID peek() const = 0;
73   /// Informs the iterator that the current document was consumed, and returns
74   /// its boost.
75   ///
76   /// Note: If this iterator has any child iterators that contain the document,
77   /// consume() should be called on those and their boosts incorporated.
78   /// consume() must *not* be called on children that don't contain the current
79   /// doc.
80   virtual float consume() = 0;
81   /// Returns an estimate of advance() calls before the iterator is exhausted.
82   virtual size_t estimateSize() const = 0;
83 
~Iterator()84   virtual ~Iterator() {}
85 
86   /// Prints a convenient human-readable iterator representation by recursively
87   /// dumping iterators in the following format:
88   ///
89   /// (Type Child1 Child2 ...)
90   ///
91   /// Where Type is the iterator type representation: "&" for And, "|" for Or,
92   /// ChildN is N-th iterator child. Raw iterators over PostingList are
93   /// represented as "[... CurID ...]" where CurID is the current PostingList
94   /// entry being inspected.
95   friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
96                                        const Iterator &Iterator) {
97     return Iterator.dump(OS);
98   }
99 
100   /// Inspect iterator type, used internally for optimizing query trees.
101   enum class Kind { And, Or, True, False, Other };
kind()102   Kind kind() const { return MyKind; }
103 
104 protected:
MyKind(MyKind)105   Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
106 
107 private:
108   virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
109   Kind MyKind;
110 };
111 
112 /// Advances the iterator until it is exhausted. Returns pairs of document IDs
113 /// with the corresponding boosting score.
114 ///
115 /// Boosting can be seen as a compromise between retrieving too many items and
116 /// calculating finals score for each of them (which might be very expensive)
117 /// and not retrieving enough items so that items with very high final score
118 /// would not be processed. Boosting score is a computationally efficient way
119 /// to acquire preliminary scores of requested items.
120 std::vector<std::pair<DocID, float>> consume(Iterator &It);
121 
122 namespace detail {
123 // Variadic template machinery.
populateChildren(std::vector<std::unique_ptr<Iterator>> &)124 inline void populateChildren(std::vector<std::unique_ptr<Iterator>> &) {}
125 template <typename... TailT>
populateChildren(std::vector<std::unique_ptr<Iterator>> & Children,std::unique_ptr<Iterator> Head,TailT...Tail)126 void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,
127                       std::unique_ptr<Iterator> Head, TailT... Tail) {
128   Children.push_back(move(Head));
129   populateChildren(Children, move(Tail)...);
130 }
131 } // namespace detail
132 
133 // A corpus is a set of documents, and a factory for iterators over them.
134 class Corpus {
135   DocID Size;
136 
137 public:
Corpus(DocID Size)138   explicit Corpus(DocID Size) : Size(Size) {}
139 
140   /// Returns AND Iterator which performs the intersection of the PostingLists
141   /// of its children.
142   ///
143   /// consume(): AND Iterator returns the product of Childrens' boosting
144   /// scores.
145   std::unique_ptr<Iterator>
146   intersect(std::vector<std::unique_ptr<Iterator>> Children) const;
147 
148   /// Returns OR Iterator which performs the union of the PostingLists of its
149   /// children.
150   ///
151   /// consume(): OR Iterator returns the highest boost value among children
152   /// containing the requested item.
153   std::unique_ptr<Iterator>
154   unionOf(std::vector<std::unique_ptr<Iterator>> Children) const;
155 
156   /// Returns TRUE Iterator which iterates over "virtual" PostingList
157   /// containing all items in range [0, Size) in an efficient manner.
158   std::unique_ptr<Iterator> all() const;
159 
160   /// Returns FALSE Iterator which iterates over no documents.
161   std::unique_ptr<Iterator> none() const;
162 
163   /// Returns BOOST iterator which multiplies the score of each item by given
164   /// factor. Boosting can be used as a computationally inexpensive filtering.
165   /// Users can return significantly more items using consumeAndBoost() and
166   /// then trim Top K using retrieval score.
167   std::unique_ptr<Iterator> boost(std::unique_ptr<Iterator> Child,
168                                   float Factor) const;
169 
170   /// Returns LIMIT iterator, which yields up to N elements of its child
171   /// iterator. Elements only count towards the limit if they are part of the
172   /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2)
173   /// 1)) yields (2), not ().
174   std::unique_ptr<Iterator> limit(std::unique_ptr<Iterator> Child,
175                                   size_t Limit) const;
176 
177   /// This allows intersect(create(...), create(...)) syntax.
178   template <typename... Args>
intersect(Args...args)179   std::unique_ptr<Iterator> intersect(Args... args) const {
180     std::vector<std::unique_ptr<Iterator>> Children;
181     detail::populateChildren(Children, std::forward<Args>(args)...);
182     return intersect(move(Children));
183   }
184 
185   /// This allows unionOf(create(...), create(...)) syntax.
186   template <typename... Args>
unionOf(Args...args)187   std::unique_ptr<Iterator> unionOf(Args... args) const {
188     std::vector<std::unique_ptr<Iterator>> Children;
189     detail::populateChildren(Children, std::forward<Args>(args)...);
190     return unionOf(move(Children));
191   }
192 };
193 
194 } // namespace dex
195 } // namespace clangd
196 } // namespace clang
197 
198 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
199