1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The data structures defined in this file are based on the reference
10 // implementation which is available at
11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16 #include "llvm/DebugInfo/CodeView/RecordName.h"
17 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
21 #include "llvm/DebugInfo/MSF/MSFCommon.h"
22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24 #include "llvm/DebugInfo/PDB/Native/Hash.h"
25 #include "llvm/Support/BinaryItemStream.h"
26 #include "llvm/Support/BinaryStreamWriter.h"
27 #include "llvm/Support/Parallel.h"
28 #include "llvm/Support/xxhash.h"
29 #include <algorithm>
30 #include <vector>
31
32 using namespace llvm;
33 using namespace llvm::msf;
34 using namespace llvm::pdb;
35 using namespace llvm::codeview;
36
37 // Helper class for building the public and global PDB hash table buckets.
38 struct llvm::pdb::GSIHashStreamBuilder {
39 // Sum of the size of all public or global records.
40 uint32_t RecordByteSize = 0;
41
42 std::vector<PSHashRecord> HashRecords;
43
44 // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
45 // reference implementation builds a hash table with IPHR_HASH buckets in it.
46 // The last bucket is used to link together free hash table cells in a linked
47 // list, but it is always empty in the compressed, on-disk format. However,
48 // the bitmap must have a bit for it.
49 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
50
51 std::vector<support::ulittle32_t> HashBuckets;
52
53 uint32_t calculateSerializedLength() const;
54 Error commit(BinaryStreamWriter &Writer);
55
56 void finalizePublicBuckets();
57 void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
58
59 // Assign public and global symbol records into hash table buckets.
60 // Modifies the list of records to store the bucket index, but does not
61 // change the order.
62 void finalizeBuckets(uint32_t RecordZeroOffset,
63 MutableArrayRef<BulkPublic> Globals);
64 };
65
66 // DenseMapInfo implementation for deduplicating symbol records.
67 struct llvm::pdb::SymbolDenseMapInfo {
getEmptyKeyllvm::pdb::SymbolDenseMapInfo68 static inline CVSymbol getEmptyKey() {
69 static CVSymbol Empty;
70 return Empty;
71 }
getTombstoneKeyllvm::pdb::SymbolDenseMapInfo72 static inline CVSymbol getTombstoneKey() {
73 static CVSymbol Tombstone(
74 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
75 return Tombstone;
76 }
getHashValuellvm::pdb::SymbolDenseMapInfo77 static unsigned getHashValue(const CVSymbol &Val) {
78 return xxHash64(Val.RecordData);
79 }
isEqualllvm::pdb::SymbolDenseMapInfo80 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
81 return LHS.RecordData == RHS.RecordData;
82 }
83 };
84
85 namespace {
86 LLVM_PACKED_START
87 struct PublicSym32Layout {
88 RecordPrefix Prefix;
89 PublicSym32Header Pub;
90 // char Name[];
91 };
92 LLVM_PACKED_END
93 } // namespace
94
95 // Calculate how much memory this public needs when serialized.
sizeOfPublic(const BulkPublic & Pub)96 static uint32_t sizeOfPublic(const BulkPublic &Pub) {
97 uint32_t NameLen = Pub.NameLen;
98 NameLen = std::min(NameLen,
99 uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
100 return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
101 }
102
serializePublic(uint8_t * Mem,const BulkPublic & Pub)103 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
104 // Assume the caller has allocated sizeOfPublic bytes.
105 uint32_t NameLen = std::min(
106 Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
107 size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
108 assert(Size == sizeOfPublic(Pub));
109 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
110 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
111 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
112 FixedMem->Pub.Flags = Pub.Flags;
113 FixedMem->Pub.Offset = Pub.Offset;
114 FixedMem->Pub.Segment = Pub.Segment;
115 char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
116 memcpy(NameMem, Pub.Name, NameLen);
117 // Zero the null terminator and remaining bytes.
118 memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
119 return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
120 }
121
calculateSerializedLength() const122 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
123 uint32_t Size = sizeof(GSIHashHeader);
124 Size += HashRecords.size() * sizeof(PSHashRecord);
125 Size += HashBitmap.size() * sizeof(uint32_t);
126 Size += HashBuckets.size() * sizeof(uint32_t);
127 return Size;
128 }
129
commit(BinaryStreamWriter & Writer)130 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
131 GSIHashHeader Header;
132 Header.VerSignature = GSIHashHeader::HdrSignature;
133 Header.VerHdr = GSIHashHeader::HdrVersion;
134 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
135 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
136
137 if (auto EC = Writer.writeObject(Header))
138 return EC;
139
140 if (auto EC = Writer.writeArray(makeArrayRef(HashRecords)))
141 return EC;
142 if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap)))
143 return EC;
144 if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets)))
145 return EC;
146 return Error::success();
147 }
148
isAsciiString(StringRef S)149 static bool isAsciiString(StringRef S) {
150 return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
151 }
152
153 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
gsiRecordCmp(StringRef S1,StringRef S2)154 static int gsiRecordCmp(StringRef S1, StringRef S2) {
155 size_t LS = S1.size();
156 size_t RS = S2.size();
157 // Shorter strings always compare less than longer strings.
158 if (LS != RS)
159 return LS - RS;
160
161 // If either string contains non ascii characters, memcmp them.
162 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
163 return memcmp(S1.data(), S2.data(), LS);
164
165 // Both strings are ascii, perform a case-insenstive comparison.
166 return S1.compare_lower(S2.data());
167 }
168
finalizePublicBuckets()169 void GSIStreamBuilder::finalizePublicBuckets() {
170 PSH->finalizeBuckets(0, Publics);
171 }
172
finalizeGlobalBuckets(uint32_t RecordZeroOffset)173 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
174 // Build up a list of globals to be bucketed. Use the BulkPublic data
175 // structure for this purpose, even though these are global records, not
176 // public records. Most of the same fields are required:
177 // - Name
178 // - NameLen
179 // - SymOffset
180 // - BucketIdx
181 // The dead fields are Offset, Segment, and Flags.
182 std::vector<BulkPublic> Records;
183 Records.resize(Globals.size());
184 uint32_t SymOffset = RecordZeroOffset;
185 for (size_t I = 0, E = Globals.size(); I < E; ++I) {
186 StringRef Name = getSymbolName(Globals[I]);
187 Records[I].Name = Name.data();
188 Records[I].NameLen = Name.size();
189 Records[I].SymOffset = SymOffset;
190 SymOffset += Globals[I].length();
191 }
192
193 GSH->finalizeBuckets(RecordZeroOffset, Records);
194 }
195
finalizeBuckets(uint32_t RecordZeroOffset,MutableArrayRef<BulkPublic> Records)196 void GSIHashStreamBuilder::finalizeBuckets(
197 uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
198 // Hash every name in parallel.
199 parallelForEachN(0, Records.size(), [&](size_t I) {
200 Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
201 });
202
203 // Count up the size of each bucket. Then, use an exclusive prefix sum to
204 // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
205 // we can't use it yet.
206 uint32_t BucketStarts[IPHR_HASH] = {0};
207 for (const BulkPublic &P : Records)
208 ++BucketStarts[P.BucketIdx];
209 uint32_t Sum = 0;
210 for (uint32_t &B : BucketStarts) {
211 uint32_t Size = B;
212 B = Sum;
213 Sum += Size;
214 }
215
216 // Place globals into the hash table in bucket order. When placing a global,
217 // update the bucket start. Every hash table slot should be filled. Always use
218 // a refcount of one for now.
219 HashRecords.resize(Records.size());
220 uint32_t BucketCursors[IPHR_HASH];
221 memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
222 for (int I = 0, E = Records.size(); I < E; ++I) {
223 uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
224 HashRecords[HashIdx].Off = I;
225 HashRecords[HashIdx].CRef = 1;
226 }
227
228 // Within the buckets, sort each bucket by memcmp of the symbol's name. It's
229 // important that we use the same sorting algorithm as is used by the
230 // reference implementation to ensure that the search for a record within a
231 // bucket can properly early-out when it detects the record won't be found.
232 // The algorithm used here corresponds to the function
233 // caseInsensitiveComparePchPchCchCch in the reference implementation.
234 parallelForEachN(0, IPHR_HASH, [&](size_t I) {
235 auto B = HashRecords.begin() + BucketStarts[I];
236 auto E = HashRecords.begin() + BucketCursors[I];
237 if (B == E)
238 return;
239 auto BucketCmp = [Records](const PSHashRecord &LHash,
240 const PSHashRecord &RHash) {
241 const BulkPublic &L = Records[uint32_t(LHash.Off)];
242 const BulkPublic &R = Records[uint32_t(RHash.Off)];
243 assert(L.BucketIdx == R.BucketIdx);
244 int Cmp = gsiRecordCmp(L.getName(), R.getName());
245 if (Cmp != 0)
246 return Cmp < 0;
247 // This comparison is necessary to make the sorting stable in the presence
248 // of two static globals with the same name. The easiest way to observe
249 // this is with S_LDATA32 records.
250 return L.SymOffset < R.SymOffset;
251 };
252 llvm::sort(B, E, BucketCmp);
253
254 // After we are done sorting, replace the global indices with the stream
255 // offsets of each global. Add one when writing symbol offsets to disk.
256 // See GSI1::fixSymRecs.
257 for (PSHashRecord &HRec : make_range(B, E))
258 HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
259 });
260
261 // For each non-empty bucket, push the bucket start offset into HashBuckets
262 // and set a bit in the hash bitmap.
263 for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
264 uint32_t Word = 0;
265 for (uint32_t J = 0; J < 32; ++J) {
266 // Skip empty buckets.
267 uint32_t BucketIdx = I * 32 + J;
268 if (BucketIdx >= IPHR_HASH ||
269 BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
270 continue;
271 Word |= (1U << J);
272
273 // Calculate what the offset of the first hash record in the chain would
274 // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
275 // each record would be 12 bytes. See HROffsetCalc in gsi.h.
276 const int SizeOfHROffsetCalc = 12;
277 ulittle32_t ChainStartOff =
278 ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
279 HashBuckets.push_back(ChainStartOff);
280 }
281 HashBitmap[I] = Word;
282 }
283 }
284
GSIStreamBuilder(msf::MSFBuilder & Msf)285 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
286 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
287 GSH(std::make_unique<GSIHashStreamBuilder>()) {}
288
~GSIStreamBuilder()289 GSIStreamBuilder::~GSIStreamBuilder() {}
290
calculatePublicsHashStreamSize() const291 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
292 uint32_t Size = 0;
293 Size += sizeof(PublicsStreamHeader);
294 Size += PSH->calculateSerializedLength();
295 Size += Publics.size() * sizeof(uint32_t); // AddrMap
296 // FIXME: Add thunk map and section offsets for incremental linking.
297
298 return Size;
299 }
300
calculateGlobalsHashStreamSize() const301 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
302 return GSH->calculateSerializedLength();
303 }
304
finalizeMsfLayout()305 Error GSIStreamBuilder::finalizeMsfLayout() {
306 // First we write public symbol records, then we write global symbol records.
307 finalizePublicBuckets();
308 finalizeGlobalBuckets(PSH->RecordByteSize);
309
310 Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
311 if (!Idx)
312 return Idx.takeError();
313 GlobalsStreamIndex = *Idx;
314
315 Idx = Msf.addStream(calculatePublicsHashStreamSize());
316 if (!Idx)
317 return Idx.takeError();
318 PublicsStreamIndex = *Idx;
319
320 uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
321
322 Idx = Msf.addStream(RecordBytes);
323 if (!Idx)
324 return Idx.takeError();
325 RecordStreamIndex = *Idx;
326 return Error::success();
327 }
328
addPublicSymbols(std::vector<BulkPublic> && PublicsIn)329 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
330 assert(Publics.empty() && PSH->RecordByteSize == 0 &&
331 "publics can only be added once");
332 Publics = std::move(PublicsIn);
333
334 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
335 parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
336 return L.getName() < R.getName();
337 });
338
339 // Assign offsets and calculate the length of the public symbol records.
340 uint32_t SymOffset = 0;
341 for (BulkPublic &Pub : Publics) {
342 Pub.SymOffset = SymOffset;
343 SymOffset += sizeOfPublic(Pub);
344 }
345
346 // Remember the length of the public stream records.
347 PSH->RecordByteSize = SymOffset;
348 }
349
addGlobalSymbol(const ProcRefSym & Sym)350 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
351 serializeAndAddGlobal(Sym);
352 }
353
addGlobalSymbol(const DataSym & Sym)354 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
355 serializeAndAddGlobal(Sym);
356 }
357
addGlobalSymbol(const ConstantSym & Sym)358 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
359 serializeAndAddGlobal(Sym);
360 }
361
362 template <typename T>
serializeAndAddGlobal(const T & Symbol)363 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
364 T Copy(Symbol);
365 addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
366 CodeViewContainer::Pdb));
367 }
368
addGlobalSymbol(const codeview::CVSymbol & Symbol)369 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
370 // Ignore duplicate typedefs and constants.
371 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
372 auto Iter = GlobalsSeen.insert(Symbol);
373 if (!Iter.second)
374 return;
375 }
376 GSH->RecordByteSize += Symbol.length();
377 Globals.push_back(Symbol);
378 }
379
380 // Serialize each public and write it.
writePublics(BinaryStreamWriter & Writer,ArrayRef<BulkPublic> Publics)381 static Error writePublics(BinaryStreamWriter &Writer,
382 ArrayRef<BulkPublic> Publics) {
383 std::vector<uint8_t> Storage;
384 for (const BulkPublic &Pub : Publics) {
385 Storage.resize(sizeOfPublic(Pub));
386 serializePublic(Storage.data(), Pub);
387 if (Error E = Writer.writeBytes(Storage))
388 return E;
389 }
390 return Error::success();
391 }
392
writeRecords(BinaryStreamWriter & Writer,ArrayRef<CVSymbol> Records)393 static Error writeRecords(BinaryStreamWriter &Writer,
394 ArrayRef<CVSymbol> Records) {
395 BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
396 ItemStream.setItems(Records);
397 BinaryStreamRef RecordsRef(ItemStream);
398 return Writer.writeStreamRef(RecordsRef);
399 }
400
commitSymbolRecordStream(WritableBinaryStreamRef Stream)401 Error GSIStreamBuilder::commitSymbolRecordStream(
402 WritableBinaryStreamRef Stream) {
403 BinaryStreamWriter Writer(Stream);
404
405 // Write public symbol records first, followed by global symbol records. This
406 // must match the order that we assume in finalizeMsfLayout when computing
407 // PSHZero and GSHZero.
408 if (auto EC = writePublics(Writer, Publics))
409 return EC;
410 if (auto EC = writeRecords(Writer, Globals))
411 return EC;
412
413 return Error::success();
414 }
415
416 static std::vector<support::ulittle32_t>
computeAddrMap(ArrayRef<BulkPublic> Publics)417 computeAddrMap(ArrayRef<BulkPublic> Publics) {
418 // Build a parallel vector of indices into the Publics vector, and sort it by
419 // address.
420 std::vector<ulittle32_t> PubAddrMap;
421 PubAddrMap.reserve(Publics.size());
422 for (int I = 0, E = Publics.size(); I < E; ++I)
423 PubAddrMap.push_back(ulittle32_t(I));
424
425 auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
426 const BulkPublic &L = Publics[LIdx];
427 const BulkPublic &R = Publics[RIdx];
428 if (L.Segment != R.Segment)
429 return L.Segment < R.Segment;
430 if (L.Offset != R.Offset)
431 return L.Offset < R.Offset;
432 // parallelSort is unstable, so we have to do name comparison to ensure
433 // that two names for the same location come out in a deterministic order.
434 return L.getName() < R.getName();
435 };
436 parallelSort(PubAddrMap, AddrCmp);
437
438 // Rewrite the public symbol indices into symbol offsets.
439 for (ulittle32_t &Entry : PubAddrMap)
440 Entry = Publics[Entry].SymOffset;
441 return PubAddrMap;
442 }
443
commitPublicsHashStream(WritableBinaryStreamRef Stream)444 Error GSIStreamBuilder::commitPublicsHashStream(
445 WritableBinaryStreamRef Stream) {
446 BinaryStreamWriter Writer(Stream);
447 PublicsStreamHeader Header;
448
449 // FIXME: Fill these in. They are for incremental linking.
450 Header.SymHash = PSH->calculateSerializedLength();
451 Header.AddrMap = Publics.size() * 4;
452 Header.NumThunks = 0;
453 Header.SizeOfThunk = 0;
454 Header.ISectThunkTable = 0;
455 memset(Header.Padding, 0, sizeof(Header.Padding));
456 Header.OffThunkTable = 0;
457 Header.NumSections = 0;
458 if (auto EC = Writer.writeObject(Header))
459 return EC;
460
461 if (auto EC = PSH->commit(Writer))
462 return EC;
463
464 std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
465 assert(PubAddrMap.size() == Publics.size());
466 if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
467 return EC;
468
469 return Error::success();
470 }
471
commitGlobalsHashStream(WritableBinaryStreamRef Stream)472 Error GSIStreamBuilder::commitGlobalsHashStream(
473 WritableBinaryStreamRef Stream) {
474 BinaryStreamWriter Writer(Stream);
475 return GSH->commit(Writer);
476 }
477
commit(const msf::MSFLayout & Layout,WritableBinaryStreamRef Buffer)478 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
479 WritableBinaryStreamRef Buffer) {
480 auto GS = WritableMappedBlockStream::createIndexedStream(
481 Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
482 auto PS = WritableMappedBlockStream::createIndexedStream(
483 Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
484 auto PRS = WritableMappedBlockStream::createIndexedStream(
485 Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
486
487 if (auto EC = commitSymbolRecordStream(*PRS))
488 return EC;
489 if (auto EC = commitGlobalsHashStream(*GS))
490 return EC;
491 if (auto EC = commitPublicsHashStream(*PS))
492 return EC;
493 return Error::success();
494 }
495