//===-- Serialization.cpp - Binary serialization of index data ------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "Serialization.h" #include "Headers.h" #include "RIFF.h" #include "SymbolLocation.h" #include "SymbolOrigin.h" #include "dex/Dex.h" #include "support/Logger.h" #include "support/Trace.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Compression.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" #include #include namespace clang { namespace clangd { namespace { // IO PRIMITIVES // We use little-endian 32 bit ints, sometimes with variable-length encoding. // // Variable-length int encoding (varint) uses the bottom 7 bits of each byte // to encode the number, and the top bit to indicate whether more bytes follow. // e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop]. // This represents 0x1a | 0x2f<<7 = 6042. // A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact. // Reads binary data from a StringRef, and keeps track of position. class Reader { const char *Begin, *End; bool Err = false; public: Reader(llvm::StringRef Data) : Begin(Data.begin()), End(Data.end()) {} // The "error" bit is set by reading past EOF or reading invalid data. // When in an error state, reads may return zero values: callers should check. bool err() const { return Err; } // Did we read all the data, or encounter an error? bool eof() const { return Begin == End || Err; } // All the data we didn't read yet. llvm::StringRef rest() const { return llvm::StringRef(Begin, End - Begin); } uint8_t consume8() { if (LLVM_UNLIKELY(Begin == End)) { Err = true; return 0; } return *Begin++; } uint32_t consume32() { if (LLVM_UNLIKELY(Begin + 4 > End)) { Err = true; return 0; } auto Ret = llvm::support::endian::read32le(Begin); Begin += 4; return Ret; } llvm::StringRef consume(int N) { if (LLVM_UNLIKELY(Begin + N > End)) { Err = true; return llvm::StringRef(); } llvm::StringRef Ret(Begin, N); Begin += N; return Ret; } uint32_t consumeVar() { constexpr static uint8_t More = 1 << 7; // Use a 32 bit unsigned here to prevent promotion to signed int (unless int // is wider than 32 bits). uint32_t B = consume8(); if (LLVM_LIKELY(!(B & More))) return B; uint32_t Val = B & ~More; for (int Shift = 7; B & More && Shift < 32; Shift += 7) { B = consume8(); // 5th byte of a varint can only have lowest 4 bits set. assert((Shift != 28 || B == (B & 0x0f)) && "Invalid varint encoding"); Val |= (B & ~More) << Shift; } return Val; } llvm::StringRef consumeString(llvm::ArrayRef Strings) { auto StringIndex = consumeVar(); if (LLVM_UNLIKELY(StringIndex >= Strings.size())) { Err = true; return llvm::StringRef(); } return Strings[StringIndex]; } SymbolID consumeID() { llvm::StringRef Raw = consume(SymbolID::RawSize); // short if truncated. return LLVM_UNLIKELY(err()) ? SymbolID() : SymbolID::fromRaw(Raw); } // Read a varint (as consumeVar) and resize the container accordingly. // If the size is invalid, return false and mark an error. // (The caller should abort in this case). template LLVM_NODISCARD bool consumeSize(T &Container) { auto Size = consumeVar(); // Conservatively assume each element is at least one byte. if (Size > (End - Begin)) { Err = true; return false; } Container.resize(Size); return true; } }; void write32(uint32_t I, llvm::raw_ostream &OS) { char Buf[4]; llvm::support::endian::write32le(Buf, I); OS.write(Buf, sizeof(Buf)); } void writeVar(uint32_t I, llvm::raw_ostream &OS) { constexpr static uint8_t More = 1 << 7; if (LLVM_LIKELY(I < 1 << 7)) { OS.write(I); return; } for (;;) { OS.write(I | More); I >>= 7; if (I < 1 << 7) { OS.write(I); return; } } } // STRING TABLE ENCODING // Index data has many string fields, and many strings are identical. // We store each string once, and refer to them by index. // // The string table's format is: // - UncompressedSize : uint32 (or 0 for no compression) // - CompressedData : byte[CompressedSize] // // CompressedData is a zlib-compressed byte[UncompressedSize]. // It contains a sequence of null-terminated strings, e.g. "foo\0bar\0". // These are sorted to improve compression. // Maps each string to a canonical representation. // Strings remain owned externally (e.g. by SymbolSlab). class StringTableOut { llvm::DenseSet Unique; std::vector Sorted; // Since strings are interned, look up can be by pointer. llvm::DenseMap, unsigned> Index; public: StringTableOut() { // Ensure there's at least one string in the table. // Table size zero is reserved to indicate no compression. Unique.insert(""); } // Add a string to the table. Overwrites S if an identical string exists. void intern(llvm::StringRef &S) { S = *Unique.insert(S).first; }; // Finalize the table and write it to OS. No more strings may be added. void finalize(llvm::raw_ostream &OS) { Sorted = {Unique.begin(), Unique.end()}; llvm::sort(Sorted); for (unsigned I = 0; I < Sorted.size(); ++I) Index.try_emplace({Sorted[I].data(), Sorted[I].size()}, I); std::string RawTable; for (llvm::StringRef S : Sorted) { RawTable.append(std::string(S)); RawTable.push_back(0); } if (llvm::zlib::isAvailable()) { llvm::SmallString<1> Compressed; llvm::cantFail(llvm::zlib::compress(RawTable, Compressed)); write32(RawTable.size(), OS); OS << Compressed; } else { write32(0, OS); // No compression. OS << RawTable; } } // Get the ID of an string, which must be interned. Table must be finalized. unsigned index(llvm::StringRef S) const { assert(!Sorted.empty() && "table not finalized"); assert(Index.count({S.data(), S.size()}) && "string not interned"); return Index.find({S.data(), S.size()})->second; } }; struct StringTableIn { llvm::BumpPtrAllocator Arena; std::vector Strings; }; llvm::Expected readStringTable(llvm::StringRef Data) { Reader R(Data); size_t UncompressedSize = R.consume32(); if (R.err()) return error("Truncated string table"); llvm::StringRef Uncompressed; llvm::SmallString<1> UncompressedStorage; if (UncompressedSize == 0) // No compression Uncompressed = R.rest(); else if (llvm::zlib::isAvailable()) { // Don't allocate a massive buffer if UncompressedSize was corrupted // This is effective for sharded index, but not big monolithic ones, as // once compressed size reaches 4MB nothing can be ruled out. // Theoretical max ratio from https://zlib.net/zlib_tech.html constexpr int MaxCompressionRatio = 1032; if (UncompressedSize / MaxCompressionRatio > R.rest().size()) return error("Bad stri table: uncompress {0} -> {1} bytes is implausible", R.rest().size(), UncompressedSize); if (llvm::Error E = llvm::zlib::uncompress(R.rest(), UncompressedStorage, UncompressedSize)) return std::move(E); Uncompressed = UncompressedStorage; } else return error("Compressed string table, but zlib is unavailable"); StringTableIn Table; llvm::StringSaver Saver(Table.Arena); R = Reader(Uncompressed); for (Reader R(Uncompressed); !R.eof();) { auto Len = R.rest().find(0); if (Len == llvm::StringRef::npos) return error("Bad string table: not null terminated"); Table.Strings.push_back(Saver.save(R.consume(Len))); R.consume8(); } if (R.err()) return error("Truncated string table"); return std::move(Table); } // SYMBOL ENCODING // Each field of clangd::Symbol is encoded in turn (see implementation). // - StringRef fields encode as varint (index into the string table) // - enums encode as the underlying type // - most numbers encode as varint void writeLocation(const SymbolLocation &Loc, const StringTableOut &Strings, llvm::raw_ostream &OS) { writeVar(Strings.index(Loc.FileURI), OS); for (const auto &Endpoint : {Loc.Start, Loc.End}) { writeVar(Endpoint.line(), OS); writeVar(Endpoint.column(), OS); } } SymbolLocation readLocation(Reader &Data, llvm::ArrayRef Strings) { SymbolLocation Loc; Loc.FileURI = Data.consumeString(Strings).data(); for (auto *Endpoint : {&Loc.Start, &Loc.End}) { Endpoint->setLine(Data.consumeVar()); Endpoint->setColumn(Data.consumeVar()); } return Loc; } IncludeGraphNode readIncludeGraphNode(Reader &Data, llvm::ArrayRef Strings) { IncludeGraphNode IGN; IGN.Flags = static_cast(Data.consume8()); IGN.URI = Data.consumeString(Strings); llvm::StringRef Digest = Data.consume(IGN.Digest.size()); std::copy(Digest.bytes_begin(), Digest.bytes_end(), IGN.Digest.begin()); if (!Data.consumeSize(IGN.DirectIncludes)) return IGN; for (llvm::StringRef &Include : IGN.DirectIncludes) Include = Data.consumeString(Strings); return IGN; } void writeIncludeGraphNode(const IncludeGraphNode &IGN, const StringTableOut &Strings, llvm::raw_ostream &OS) { OS.write(static_cast(IGN.Flags)); writeVar(Strings.index(IGN.URI), OS); llvm::StringRef Hash(reinterpret_cast(IGN.Digest.data()), IGN.Digest.size()); OS << Hash; writeVar(IGN.DirectIncludes.size(), OS); for (llvm::StringRef Include : IGN.DirectIncludes) writeVar(Strings.index(Include), OS); } void writeSymbol(const Symbol &Sym, const StringTableOut &Strings, llvm::raw_ostream &OS) { OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists, // symbol IDs should probably be in a string table. OS.write(static_cast(Sym.SymInfo.Kind)); OS.write(static_cast(Sym.SymInfo.Lang)); writeVar(Strings.index(Sym.Name), OS); writeVar(Strings.index(Sym.Scope), OS); writeVar(Strings.index(Sym.TemplateSpecializationArgs), OS); writeLocation(Sym.Definition, Strings, OS); writeLocation(Sym.CanonicalDeclaration, Strings, OS); writeVar(Sym.References, OS); OS.write(static_cast(Sym.Flags)); OS.write(static_cast(Sym.Origin)); writeVar(Strings.index(Sym.Signature), OS); writeVar(Strings.index(Sym.CompletionSnippetSuffix), OS); writeVar(Strings.index(Sym.Documentation), OS); writeVar(Strings.index(Sym.ReturnType), OS); writeVar(Strings.index(Sym.Type), OS); auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) { writeVar(Strings.index(Include.IncludeHeader), OS); writeVar(Include.References, OS); }; writeVar(Sym.IncludeHeaders.size(), OS); for (const auto &Include : Sym.IncludeHeaders) WriteInclude(Include); } Symbol readSymbol(Reader &Data, llvm::ArrayRef Strings) { Symbol Sym; Sym.ID = Data.consumeID(); Sym.SymInfo.Kind = static_cast(Data.consume8()); Sym.SymInfo.Lang = static_cast(Data.consume8()); Sym.Name = Data.consumeString(Strings); Sym.Scope = Data.consumeString(Strings); Sym.TemplateSpecializationArgs = Data.consumeString(Strings); Sym.Definition = readLocation(Data, Strings); Sym.CanonicalDeclaration = readLocation(Data, Strings); Sym.References = Data.consumeVar(); Sym.Flags = static_cast(Data.consume8()); Sym.Origin = static_cast(Data.consume8()); Sym.Signature = Data.consumeString(Strings); Sym.CompletionSnippetSuffix = Data.consumeString(Strings); Sym.Documentation = Data.consumeString(Strings); Sym.ReturnType = Data.consumeString(Strings); Sym.Type = Data.consumeString(Strings); if (!Data.consumeSize(Sym.IncludeHeaders)) return Sym; for (auto &I : Sym.IncludeHeaders) { I.IncludeHeader = Data.consumeString(Strings); I.References = Data.consumeVar(); } return Sym; } // REFS ENCODING // A refs section has data grouped by Symbol. Each symbol has: // - SymbolID: 8 bytes // - NumRefs: varint // - Ref[NumRefs] // Fields of Ref are encoded in turn, see implementation. void writeRefs(const SymbolID &ID, llvm::ArrayRef Refs, const StringTableOut &Strings, llvm::raw_ostream &OS) { OS << ID.raw(); writeVar(Refs.size(), OS); for (const auto &Ref : Refs) { OS.write(static_cast(Ref.Kind)); writeLocation(Ref.Location, Strings, OS); OS << Ref.Container.raw(); } } std::pair> readRefs(Reader &Data, llvm::ArrayRef Strings) { std::pair> Result; Result.first = Data.consumeID(); if (!Data.consumeSize(Result.second)) return Result; for (auto &Ref : Result.second) { Ref.Kind = static_cast(Data.consume8()); Ref.Location = readLocation(Data, Strings); Ref.Container = Data.consumeID(); } return Result; } // RELATIONS ENCODING // A relations section is a flat list of relations. Each relation has: // - SymbolID (subject): 8 bytes // - relation kind (predicate): 1 byte // - SymbolID (object): 8 bytes // In the future, we might prefer a packed representation if the need arises. void writeRelation(const Relation &R, llvm::raw_ostream &OS) { OS << R.Subject.raw(); OS.write(static_cast(R.Predicate)); OS << R.Object.raw(); } Relation readRelation(Reader &Data) { SymbolID Subject = Data.consumeID(); RelationKind Predicate = static_cast(Data.consume8()); SymbolID Object = Data.consumeID(); return {Subject, Predicate, Object}; } struct InternedCompileCommand { llvm::StringRef Directory; std::vector CommandLine; }; void writeCompileCommand(const InternedCompileCommand &Cmd, const StringTableOut &Strings, llvm::raw_ostream &CmdOS) { writeVar(Strings.index(Cmd.Directory), CmdOS); writeVar(Cmd.CommandLine.size(), CmdOS); for (llvm::StringRef C : Cmd.CommandLine) writeVar(Strings.index(C), CmdOS); } InternedCompileCommand readCompileCommand(Reader CmdReader, llvm::ArrayRef Strings) { InternedCompileCommand Cmd; Cmd.Directory = CmdReader.consumeString(Strings); if (!CmdReader.consumeSize(Cmd.CommandLine)) return Cmd; for (llvm::StringRef &C : Cmd.CommandLine) C = CmdReader.consumeString(Strings); return Cmd; } // FILE ENCODING // A file is a RIFF chunk with type 'CdIx'. // It contains the sections: // - meta: version number // - srcs: information related to include graph // - stri: string table // - symb: symbols // - refs: references to symbols // The current versioning scheme is simple - non-current versions are rejected. // If you make a breaking change, bump this version number to invalidate stored // data. Later we may want to support some backward compatibility. constexpr static uint32_t Version = 15; llvm::Expected readRIFF(llvm::StringRef Data) { auto RIFF = riff::readFile(Data); if (!RIFF) return RIFF.takeError(); if (RIFF->Type != riff::fourCC("CdIx")) return error("wrong RIFF filetype: {0}", riff::fourCCStr(RIFF->Type)); llvm::StringMap Chunks; for (const auto &Chunk : RIFF->Chunks) Chunks.try_emplace(llvm::StringRef(Chunk.ID.data(), Chunk.ID.size()), Chunk.Data); if (!Chunks.count("meta")) return error("missing meta chunk"); Reader Meta(Chunks.lookup("meta")); auto SeenVersion = Meta.consume32(); if (SeenVersion != Version) return error("wrong version: want {0}, got {1}", Version, SeenVersion); // meta chunk is checked above, as we prefer the "version mismatch" error. for (llvm::StringRef RequiredChunk : {"stri"}) if (!Chunks.count(RequiredChunk)) return error("missing required chunk {0}", RequiredChunk); auto Strings = readStringTable(Chunks.lookup("stri")); if (!Strings) return Strings.takeError(); IndexFileIn Result; if (Chunks.count("srcs")) { Reader SrcsReader(Chunks.lookup("srcs")); Result.Sources.emplace(); while (!SrcsReader.eof()) { auto IGN = readIncludeGraphNode(SrcsReader, Strings->Strings); auto Entry = Result.Sources->try_emplace(IGN.URI).first; Entry->getValue() = std::move(IGN); // We change all the strings inside the structure to point at the keys in // the map, since it is the only copy of the string that's going to live. Entry->getValue().URI = Entry->getKey(); for (auto &Include : Entry->getValue().DirectIncludes) Include = Result.Sources->try_emplace(Include).first->getKey(); } if (SrcsReader.err()) return error("malformed or truncated include uri"); } if (Chunks.count("symb")) { Reader SymbolReader(Chunks.lookup("symb")); SymbolSlab::Builder Symbols; while (!SymbolReader.eof()) Symbols.insert(readSymbol(SymbolReader, Strings->Strings)); if (SymbolReader.err()) return error("malformed or truncated symbol"); Result.Symbols = std::move(Symbols).build(); } if (Chunks.count("refs")) { Reader RefsReader(Chunks.lookup("refs")); RefSlab::Builder Refs; while (!RefsReader.eof()) { auto RefsBundle = readRefs(RefsReader, Strings->Strings); for (const auto &Ref : RefsBundle.second) // FIXME: bulk insert? Refs.insert(RefsBundle.first, Ref); } if (RefsReader.err()) return error("malformed or truncated refs"); Result.Refs = std::move(Refs).build(); } if (Chunks.count("rela")) { Reader RelationsReader(Chunks.lookup("rela")); RelationSlab::Builder Relations; while (!RelationsReader.eof()) Relations.insert(readRelation(RelationsReader)); if (RelationsReader.err()) return error("malformed or truncated relations"); Result.Relations = std::move(Relations).build(); } if (Chunks.count("cmdl")) { Reader CmdReader(Chunks.lookup("cmdl")); InternedCompileCommand Cmd = readCompileCommand(CmdReader, Strings->Strings); if (CmdReader.err()) return error("malformed or truncated commandline section"); Result.Cmd.emplace(); Result.Cmd->Directory = std::string(Cmd.Directory); Result.Cmd->CommandLine.reserve(Cmd.CommandLine.size()); for (llvm::StringRef C : Cmd.CommandLine) Result.Cmd->CommandLine.emplace_back(C); } return std::move(Result); } template void visitStrings(IncludeGraphNode &IGN, const Callback &CB) { CB(IGN.URI); for (llvm::StringRef &Include : IGN.DirectIncludes) CB(Include); } void writeRIFF(const IndexFileOut &Data, llvm::raw_ostream &OS) { assert(Data.Symbols && "An index file without symbols makes no sense!"); riff::File RIFF; RIFF.Type = riff::fourCC("CdIx"); llvm::SmallString<4> Meta; { llvm::raw_svector_ostream MetaOS(Meta); write32(Version, MetaOS); } RIFF.Chunks.push_back({riff::fourCC("meta"), Meta}); StringTableOut Strings; std::vector Symbols; for (const auto &Sym : *Data.Symbols) { Symbols.emplace_back(Sym); visitStrings(Symbols.back(), [&](llvm::StringRef &S) { Strings.intern(S); }); } std::vector Sources; if (Data.Sources) for (const auto &Source : *Data.Sources) { Sources.push_back(Source.getValue()); visitStrings(Sources.back(), [&](llvm::StringRef &S) { Strings.intern(S); }); } std::vector>> Refs; if (Data.Refs) { for (const auto &Sym : *Data.Refs) { Refs.emplace_back(Sym); for (auto &Ref : Refs.back().second) { llvm::StringRef File = Ref.Location.FileURI; Strings.intern(File); Ref.Location.FileURI = File.data(); } } } std::vector Relations; if (Data.Relations) { for (const auto &Relation : *Data.Relations) { Relations.emplace_back(Relation); // No strings to be interned in relations. } } InternedCompileCommand InternedCmd; if (Data.Cmd) { InternedCmd.CommandLine.reserve(Data.Cmd->CommandLine.size()); InternedCmd.Directory = Data.Cmd->Directory; Strings.intern(InternedCmd.Directory); for (llvm::StringRef C : Data.Cmd->CommandLine) { InternedCmd.CommandLine.emplace_back(C); Strings.intern(InternedCmd.CommandLine.back()); } } std::string StringSection; { llvm::raw_string_ostream StringOS(StringSection); Strings.finalize(StringOS); } RIFF.Chunks.push_back({riff::fourCC("stri"), StringSection}); std::string SymbolSection; { llvm::raw_string_ostream SymbolOS(SymbolSection); for (const auto &Sym : Symbols) writeSymbol(Sym, Strings, SymbolOS); } RIFF.Chunks.push_back({riff::fourCC("symb"), SymbolSection}); std::string RefsSection; if (Data.Refs) { { llvm::raw_string_ostream RefsOS(RefsSection); for (const auto &Sym : Refs) writeRefs(Sym.first, Sym.second, Strings, RefsOS); } RIFF.Chunks.push_back({riff::fourCC("refs"), RefsSection}); } std::string RelationSection; if (Data.Relations) { { llvm::raw_string_ostream RelationOS{RelationSection}; for (const auto &Relation : Relations) writeRelation(Relation, RelationOS); } RIFF.Chunks.push_back({riff::fourCC("rela"), RelationSection}); } std::string SrcsSection; { { llvm::raw_string_ostream SrcsOS(SrcsSection); for (const auto &SF : Sources) writeIncludeGraphNode(SF, Strings, SrcsOS); } RIFF.Chunks.push_back({riff::fourCC("srcs"), SrcsSection}); } std::string CmdlSection; if (Data.Cmd) { { llvm::raw_string_ostream CmdOS(CmdlSection); writeCompileCommand(InternedCmd, Strings, CmdOS); } RIFF.Chunks.push_back({riff::fourCC("cmdl"), CmdlSection}); } OS << RIFF; } } // namespace // Defined in YAMLSerialization.cpp. void writeYAML(const IndexFileOut &, llvm::raw_ostream &); llvm::Expected readYAML(llvm::StringRef); llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const IndexFileOut &O) { switch (O.Format) { case IndexFileFormat::RIFF: writeRIFF(O, OS); break; case IndexFileFormat::YAML: writeYAML(O, OS); break; } return OS; } llvm::Expected readIndexFile(llvm::StringRef Data) { if (Data.startswith("RIFF")) { return readRIFF(Data); } else if (auto YAMLContents = readYAML(Data)) { return std::move(*YAMLContents); } else { return error("Not a RIFF file and failed to parse as YAML: {0}", YAMLContents.takeError()); } } std::unique_ptr loadIndex(llvm::StringRef SymbolFilename, bool UseDex) { trace::Span OverallTracer("LoadIndex"); auto Buffer = llvm::MemoryBuffer::getFile(SymbolFilename); if (!Buffer) { elog("Can't open {0}: {1}", SymbolFilename, Buffer.getError().message()); return nullptr; } SymbolSlab Symbols; RefSlab Refs; RelationSlab Relations; { trace::Span Tracer("ParseIndex"); if (auto I = readIndexFile(Buffer->get()->getBuffer())) { if (I->Symbols) Symbols = std::move(*I->Symbols); if (I->Refs) Refs = std::move(*I->Refs); if (I->Relations) Relations = std::move(*I->Relations); } else { elog("Bad index file: {0}", I.takeError()); return nullptr; } } size_t NumSym = Symbols.size(); size_t NumRefs = Refs.numRefs(); size_t NumRelations = Relations.size(); trace::Span Tracer("BuildIndex"); auto Index = UseDex ? dex::Dex::build(std::move(Symbols), std::move(Refs), std::move(Relations)) : MemIndex::build(std::move(Symbols), std::move(Refs), std::move(Relations)); vlog("Loaded {0} from {1} with estimated memory usage {2} bytes\n" " - number of symbols: {3}\n" " - number of refs: {4}\n" " - number of relations: {5}", UseDex ? "Dex" : "MemIndex", SymbolFilename, Index->estimateMemoryUsage(), NumSym, NumRefs, NumRelations); return Index; } } // namespace clangd } // namespace clang