1 //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "Dex.h"
10 #include "FileDistance.h"
11 #include "FuzzyMatch.h"
12 #include "Quality.h"
13 #include "index/Index.h"
14 #include "index/dex/Iterator.h"
15 #include "support/Logger.h"
16 #include "support/Trace.h"
17 #include "llvm/ADT/StringSet.h"
18 #include "llvm/Support/ScopedPrinter.h"
19 #include <algorithm>
20 #include <queue>
21
22 namespace clang {
23 namespace clangd {
24 namespace dex {
25
build(SymbolSlab Symbols,RefSlab Refs,RelationSlab Rels)26 std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
27 RelationSlab Rels) {
28 auto Size = Symbols.bytes() + Refs.bytes();
29 // There is no need to include "Rels" in Data because the relations are self-
30 // contained, without references into a backing store.
31 auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
32 return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
33 Size);
34 }
35
36 namespace {
37
38 // Mark symbols which are can be used for code completion.
39 const Token RestrictedForCodeCompletion =
40 Token(Token::Kind::Sentinel, "Restricted For Code Completion");
41
42 // Helper to efficiently assemble the inverse index (token -> matching docs).
43 // The output is a nice uniform structure keyed on Token, but constructing
44 // the Token object every time we want to insert into the map is wasteful.
45 // Instead we have various maps keyed on things that are cheap to compute,
46 // and produce the Token keys once at the end.
47 class IndexBuilder {
48 llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
49 std::vector<DocID> RestrictedCCDocs;
50 llvm::StringMap<std::vector<DocID>> TypeDocs;
51 llvm::StringMap<std::vector<DocID>> ScopeDocs;
52 llvm::StringMap<std::vector<DocID>> ProximityDocs;
53 std::vector<Trigram> TrigramScratch;
54
55 public:
56 // Add the tokens which are given symbol's characteristics.
57 // This includes fuzzy matching trigrams, symbol's scope, etc.
58 // FIXME(kbobyrev): Support more token types:
59 // * Namespace proximity
add(const Symbol & Sym,DocID D)60 void add(const Symbol &Sym, DocID D) {
61 generateIdentifierTrigrams(Sym.Name, TrigramScratch);
62 for (Trigram T : TrigramScratch)
63 TrigramDocs[T].push_back(D);
64 ScopeDocs[Sym.Scope].push_back(D);
65 if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
66 for (const auto &ProximityURI :
67 generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
68 ProximityDocs[ProximityURI].push_back(D);
69 if (Sym.Flags & Symbol::IndexedForCodeCompletion)
70 RestrictedCCDocs.push_back(D);
71 if (!Sym.Type.empty())
72 TypeDocs[Sym.Type].push_back(D);
73 }
74
75 // Assemble the final compressed posting lists for the added symbols.
build()76 llvm::DenseMap<Token, PostingList> build() {
77 llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
78 TrigramDocs.size() +
79 RestrictedCCDocs.size() +
80 TypeDocs.size() +
81 ScopeDocs.size() +
82 ProximityDocs.size());
83 for (const auto &E : TrigramDocs)
84 Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
85 for (const auto &E : TypeDocs)
86 Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
87 for (const auto &E : ScopeDocs)
88 Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
89 for (const auto &E : ProximityDocs)
90 Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
91 if (!RestrictedCCDocs.empty())
92 Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
93 return Result;
94 }
95 };
96
97 } // namespace
98
buildIndex()99 void Dex::buildIndex() {
100 this->Corpus = dex::Corpus(Symbols.size());
101 std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
102
103 for (size_t I = 0; I < Symbols.size(); ++I) {
104 const Symbol *Sym = Symbols[I];
105 LookupTable[Sym->ID] = Sym;
106 ScoredSymbols[I] = {quality(*Sym), Sym};
107 }
108
109 // Symbols are sorted by symbol qualities so that items in the posting lists
110 // are stored in the descending order of symbol quality.
111 llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
112
113 // SymbolQuality was empty up until now.
114 SymbolQuality.resize(Symbols.size());
115 // Populate internal storage using Symbol + Score pairs.
116 for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
117 SymbolQuality[I] = ScoredSymbols[I].first;
118 Symbols[I] = ScoredSymbols[I].second;
119 }
120
121 // Build posting lists for symbols.
122 IndexBuilder Builder;
123 for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
124 Builder.add(*Symbols[SymbolRank], SymbolRank);
125 InvertedIndex = Builder.build();
126 }
127
iterator(const Token & Tok) const128 std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
129 auto It = InvertedIndex.find(Tok);
130 return It == InvertedIndex.end() ? Corpus.none()
131 : It->second.iterator(&It->first);
132 }
133
134 // Constructs BOOST iterators for Path Proximities.
createFileProximityIterator(llvm::ArrayRef<std::string> ProximityPaths) const135 std::unique_ptr<Iterator> Dex::createFileProximityIterator(
136 llvm::ArrayRef<std::string> ProximityPaths) const {
137 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
138 // Deduplicate parent URIs extracted from the ProximityPaths.
139 llvm::StringSet<> ParentURIs;
140 llvm::StringMap<SourceParams> Sources;
141 for (const auto &Path : ProximityPaths) {
142 Sources[Path] = SourceParams();
143 auto PathURI = URI::create(Path);
144 const auto PathProximityURIs = generateProximityURIs(PathURI.toString());
145 for (const auto &ProximityURI : PathProximityURIs)
146 ParentURIs.insert(ProximityURI);
147 }
148 // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
149 // for all parameters except for Proximity Path distance signal.
150 SymbolRelevanceSignals PathProximitySignals;
151 // DistanceCalculator will find the shortest distance from ProximityPaths to
152 // any URI extracted from the ProximityPaths.
153 URIDistance DistanceCalculator(Sources);
154 PathProximitySignals.FileProximityMatch = &DistanceCalculator;
155 // Try to build BOOST iterator for each Proximity Path provided by
156 // ProximityPaths. Boosting factor should depend on the distance to the
157 // Proximity Path: the closer processed path is, the higher boosting factor.
158 for (const auto &ParentURI : ParentURIs.keys()) {
159 // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
160 auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
161 if (It->kind() != Iterator::Kind::False) {
162 PathProximitySignals.SymbolURI = ParentURI;
163 BoostingIterators.push_back(Corpus.boost(
164 std::move(It), PathProximitySignals.evaluateHeuristics()));
165 }
166 }
167 BoostingIterators.push_back(Corpus.all());
168 return Corpus.unionOf(std::move(BoostingIterators));
169 }
170
171 // Constructs BOOST iterators for preferred types.
172 std::unique_ptr<Iterator>
createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const173 Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
174 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
175 SymbolRelevanceSignals PreferredTypeSignals;
176 PreferredTypeSignals.TypeMatchesPreferred = true;
177 auto Boost = PreferredTypeSignals.evaluateHeuristics();
178 for (const auto &T : Types)
179 BoostingIterators.push_back(
180 Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
181 BoostingIterators.push_back(Corpus.all());
182 return Corpus.unionOf(std::move(BoostingIterators));
183 }
184
185 /// Constructs iterators over tokens extracted from the query and exhausts it
186 /// while applying Callback to each symbol in the order of decreasing quality
187 /// of the matched symbols.
fuzzyFind(const FuzzyFindRequest & Req,llvm::function_ref<void (const Symbol &)> Callback) const188 bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
189 llvm::function_ref<void(const Symbol &)> Callback) const {
190 assert(!StringRef(Req.Query).contains("::") &&
191 "There must be no :: in query.");
192 trace::Span Tracer("Dex fuzzyFind");
193 FuzzyMatcher Filter(Req.Query);
194 // For short queries we use specialized trigrams that don't yield all results.
195 // Prevent clients from postfiltering them for longer queries.
196 bool More = !Req.Query.empty() && Req.Query.size() < 3;
197
198 std::vector<std::unique_ptr<Iterator>> Criteria;
199 const auto TrigramTokens = generateQueryTrigrams(Req.Query);
200
201 // Generate query trigrams and construct AND iterator over all query
202 // trigrams.
203 std::vector<std::unique_ptr<Iterator>> TrigramIterators;
204 for (const auto &Trigram : TrigramTokens)
205 TrigramIterators.push_back(iterator(Trigram));
206 Criteria.push_back(Corpus.intersect(move(TrigramIterators)));
207
208 // Generate scope tokens for search query.
209 std::vector<std::unique_ptr<Iterator>> ScopeIterators;
210 for (const auto &Scope : Req.Scopes)
211 ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
212 if (Req.AnyScope)
213 ScopeIterators.push_back(
214 Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
215 Criteria.push_back(Corpus.unionOf(move(ScopeIterators)));
216
217 // Add proximity paths boosting (all symbols, some boosted).
218 Criteria.push_back(createFileProximityIterator(Req.ProximityPaths));
219 // Add boosting for preferred types.
220 Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes));
221
222 if (Req.RestrictForCodeCompletion)
223 Criteria.push_back(iterator(RestrictedForCodeCompletion));
224
225 // Use TRUE iterator if both trigrams and scopes from the query are not
226 // present in the symbol index.
227 auto Root = Corpus.intersect(move(Criteria));
228 // Retrieve more items than it was requested: some of the items with high
229 // final score might not be retrieved otherwise.
230 // FIXME(kbobyrev): Tune this ratio.
231 if (Req.Limit)
232 Root = Corpus.limit(move(Root), *Req.Limit * 100);
233 SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
234 vlog("Dex query tree: {0}", *Root);
235
236 using IDAndScore = std::pair<DocID, float>;
237 std::vector<IDAndScore> IDAndScores = consume(*Root);
238
239 auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
240 return LHS.second > RHS.second;
241 };
242 TopN<IDAndScore, decltype(Compare)> Top(
243 Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
244 for (const auto &IDAndScore : IDAndScores) {
245 const DocID SymbolDocID = IDAndScore.first;
246 const auto *Sym = Symbols[SymbolDocID];
247 const llvm::Optional<float> Score = Filter.match(Sym->Name);
248 if (!Score)
249 continue;
250 // Combine Fuzzy Matching score, precomputed symbol quality and boosting
251 // score for a cumulative final symbol score.
252 const float FinalScore =
253 (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
254 // If Top.push(...) returns true, it means that it had to pop an item. In
255 // this case, it is possible to retrieve more symbols.
256 if (Top.push({SymbolDocID, FinalScore}))
257 More = true;
258 }
259
260 // Apply callback to the top Req.Limit items in the descending
261 // order of cumulative score.
262 for (const auto &Item : std::move(Top).items())
263 Callback(*Symbols[Item.first]);
264 return More;
265 }
266
lookup(const LookupRequest & Req,llvm::function_ref<void (const Symbol &)> Callback) const267 void Dex::lookup(const LookupRequest &Req,
268 llvm::function_ref<void(const Symbol &)> Callback) const {
269 trace::Span Tracer("Dex lookup");
270 for (const auto &ID : Req.IDs) {
271 auto I = LookupTable.find(ID);
272 if (I != LookupTable.end())
273 Callback(*I->second);
274 }
275 }
276
refs(const RefsRequest & Req,llvm::function_ref<void (const Ref &)> Callback) const277 bool Dex::refs(const RefsRequest &Req,
278 llvm::function_ref<void(const Ref &)> Callback) const {
279 trace::Span Tracer("Dex refs");
280 uint32_t Remaining =
281 Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
282 for (const auto &ID : Req.IDs)
283 for (const auto &Ref : Refs.lookup(ID)) {
284 if (!static_cast<int>(Req.Filter & Ref.Kind))
285 continue;
286 if (Remaining == 0)
287 return true; // More refs were available.
288 --Remaining;
289 Callback(Ref);
290 }
291 return false; // We reported all refs.
292 }
293
relations(const RelationsRequest & Req,llvm::function_ref<void (const SymbolID &,const Symbol &)> Callback) const294 void Dex::relations(
295 const RelationsRequest &Req,
296 llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
297 trace::Span Tracer("Dex relations");
298 uint32_t Remaining =
299 Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
300 for (const SymbolID &Subject : Req.Subjects) {
301 LookupRequest LookupReq;
302 auto It = Relations.find(
303 std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
304 if (It != Relations.end()) {
305 for (const auto &Object : It->second) {
306 if (Remaining > 0) {
307 --Remaining;
308 LookupReq.IDs.insert(Object);
309 }
310 }
311 }
312 lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
313 }
314 }
315
estimateMemoryUsage() const316 size_t Dex::estimateMemoryUsage() const {
317 size_t Bytes = Symbols.size() * sizeof(const Symbol *);
318 Bytes += SymbolQuality.size() * sizeof(float);
319 Bytes += LookupTable.getMemorySize();
320 Bytes += InvertedIndex.getMemorySize();
321 for (const auto &TokenToPostingList : InvertedIndex)
322 Bytes += TokenToPostingList.second.bytes();
323 Bytes += Refs.getMemorySize();
324 Bytes += Relations.getMemorySize();
325 return Bytes + BackingDataSize;
326 }
327
generateProximityURIs(llvm::StringRef URIPath)328 std::vector<std::string> generateProximityURIs(llvm::StringRef URIPath) {
329 std::vector<std::string> Result;
330 auto ParsedURI = URI::parse(URIPath);
331 assert(ParsedURI &&
332 "Non-empty argument of generateProximityURIs() should be a valid "
333 "URI.");
334 llvm::StringRef Body = ParsedURI->body();
335 // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
336 // size of resulting vector. Some projects might want to have higher limit if
337 // the file hierarchy is deeper. For the generic case, it would be useful to
338 // calculate Limit in the index build stage by calculating the maximum depth
339 // of the project source tree at runtime.
340 size_t Limit = 5;
341 // Insert original URI before the loop: this would save a redundant iteration
342 // with a URI parse.
343 Result.emplace_back(ParsedURI->toString());
344 while (!Body.empty() && --Limit > 0) {
345 // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
346 // could be optimized.
347 Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
348 if (!Body.empty())
349 Result.emplace_back(
350 URI(ParsedURI->scheme(), ParsedURI->authority(), Body).toString());
351 }
352 return Result;
353 }
354
355 } // namespace dex
356 } // namespace clangd
357 } // namespace clang
358