1 #include "./durchschlag.h"
2 
3 #include <algorithm>
4 #include <exception>  /* terminate */
5 
6 #include "divsufsort.h"
7 
8 /* Pointer to position in text. */
9 typedef DurchschlagTextIdx TextIdx;
10 
11 /* (Sum of) value(s) of slice(s). */
12 typedef uint32_t Score;
13 
14 typedef struct HashSlot {
15   TextIdx next;
16   TextIdx offset;
17 } HashSlot;
18 
19 typedef struct MetaSlot {
20   TextIdx mark;
21   Score score;
22 } MetaSlot;
23 
24 typedef struct Range {
25   TextIdx start;
26   TextIdx end;
27 } Range;
28 
29 typedef struct Candidate {
30   Score score;
31   TextIdx position;
32 } Candidate;
33 
34 struct greaterScore {
operator ()greaterScore35   bool operator()(const Candidate& a, const Candidate& b) const {
36     return (a.score > b.score) ||
37         ((a.score == b.score) && (a.position < b.position));
38   }
39 };
40 
41 struct lessScore {
operator ()lessScore42   bool operator()(const Candidate& a, const Candidate& b) const {
43     return (a.score < b.score) ||
44         ((a.score == b.score) && (a.position > b.position));
45   }
46 };
47 
/* Number of leading positions buildCandidatesList always admits to the
   candidate list; once the list grows past this size, the lowest-scored
   entries may be evicted. Also used to size the initial reservation of the
   candidate vector. */
#define CANDIDATE_BUNDLE_SIZE (1 << 18)
49 
/* Reports an unrecoverable error and aborts.
   Writes `error` followed by a newline to stderr, then calls std::terminate().
   Never returns; the attribute tells the compiler so at every call site. */
[[noreturn]] static void fatal(const char* error) {
  fprintf(stderr, "%s\n", error);
  std::terminate();
}
54 
calculateDictionarySize(const std::vector<Range> & ranges)55 static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) {
56   TextIdx result = 0;
57   for (size_t i = 0; i < ranges.size(); ++i) {
58     const Range& r = ranges[i];
59     result += r.end - r.start;
60   }
61   return result;
62 }
63 
createDictionary(const uint8_t * data,const std::vector<Range> & ranges,size_t limit)64 static std::string createDictionary(
65     const uint8_t* data, const std::vector<Range>& ranges, size_t limit) {
66   std::string output;
67   output.reserve(calculateDictionarySize(ranges));
68   for (size_t i = 0; i < ranges.size(); ++i) {
69     const Range& r = ranges[i];
70     output.insert(output.end(), &data[r.start], &data[r.end]);
71   }
72   if (output.size() > limit) {
73     output.resize(limit);
74   }
75   return output;
76 }
77 
78 /* precondition: span > 0
79    precondition: end + span == len(shortcut) */
buildCandidatesList(std::vector<Candidate> * candidates,std::vector<MetaSlot> * map,TextIdx span,const TextIdx * shortcut,TextIdx end)80 static Score buildCandidatesList(std::vector<Candidate>* candidates,
81     std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
82     TextIdx end) {
83   candidates->resize(0);
84 
85   size_t n = map->size();
86   MetaSlot* slots = map->data();
87   for (size_t j = 0; j < n; ++j) {
88     slots[j].mark = 0;
89   }
90 
91   Score score = 0;
92   /* Consider the whole span, except one last item. The following loop will
93      add the last item to the end of the "chain", evaluate it, and cut one
94      "link" form the beginning. */
95   for (size_t j = 0; j < span - 1; ++j) {
96     MetaSlot& item = slots[shortcut[j]];
97     if (item.mark == 0) {
98       score += item.score;
99     }
100     item.mark++;
101   }
102 
103   TextIdx i = 0;
104   TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE);
105   Score maxScore = 0;
106   for (; i < limit; ++i) {
107     TextIdx slice = shortcut[i + span - 1];
108     MetaSlot& pick = slots[slice];
109     if (pick.mark == 0) {
110       score += pick.score;
111     }
112     pick.mark++;
113 
114     if (score > maxScore) {
115       maxScore = score;
116     }
117     candidates->push_back({score, i});
118 
119     MetaSlot& drop = slots[shortcut[i]];
120     drop.mark--;
121     if (drop.mark == 0) {
122       score -= drop.score;
123     }
124   }
125 
126   std::make_heap(candidates->begin(), candidates->end(), greaterScore());
127   Score minScore = candidates->at(0).score;
128   for (; i < end; ++i) {
129     TextIdx slice = shortcut[i + span - 1];
130     MetaSlot& pick = slots[slice];
131     if (pick.mark == 0) {
132       score += pick.score;
133     }
134     pick.mark++;
135 
136     if (score > maxScore) {
137       maxScore = score;
138     }
139     if (score >= minScore) {
140       candidates->push_back({score, i});
141       std::push_heap(candidates->begin(), candidates->end(), greaterScore());
142       if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) {
143         while (candidates->at(0).score == minScore) {
144           std::pop_heap(candidates->begin(), candidates->end(), greaterScore());
145           candidates->pop_back();
146         }
147         minScore = candidates->at(0).score;
148       }
149     }
150 
151     MetaSlot& drop = slots[shortcut[i]];
152     drop.mark--;
153     if (drop.mark == 0) {
154       score -= drop.score;
155     }
156   }
157 
158   for (size_t j = 0; j < n; ++j) {
159     slots[j].mark = 0;
160   }
161 
162   std::make_heap(candidates->begin(), candidates->end(), lessScore());
163   return minScore;
164 }
165 
166 /* precondition: span > 0
167    precondition: end + span == len(shortcut) */
rebuildCandidatesList(std::vector<TextIdx> * candidates,std::vector<MetaSlot> * map,TextIdx span,const TextIdx * shortcut,TextIdx end,TextIdx * next)168 static Score rebuildCandidatesList(std::vector<TextIdx>* candidates,
169     std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
170     TextIdx end, TextIdx* next) {
171   size_t n = candidates->size();
172   TextIdx* data = candidates->data();
173   for (size_t i = 0; i < n; ++i) {
174     data[i] = 0;
175   }
176 
177   n = map->size();
178   MetaSlot* slots = map->data();
179   for (size_t i = 0; i < n; ++i) {
180     slots[i].mark = 0;
181   }
182 
183   Score score = 0;
184   /* Consider the whole span, except one last item. The following loop will
185      add the last item to the end of the "chain", evaluate it, and cut one
186      "link" form the beginning. */
187   for (TextIdx i = 0; i < span - 1; ++i) {
188     MetaSlot& item = slots[shortcut[i]];
189     if (item.mark == 0) {
190       score += item.score;
191     }
192     item.mark++;
193   }
194 
195   Score maxScore = 0;
196   for (TextIdx i = 0; i < end; ++i) {
197     MetaSlot& pick = slots[shortcut[i + span - 1]];
198     if (pick.mark == 0) {
199       score += pick.score;
200     }
201     pick.mark++;
202 
203     if (candidates->size() <= score) {
204       candidates->resize(score + 1);
205     }
206     if (score > maxScore) {
207       maxScore = score;
208     }
209     next[i] = candidates->at(score);
210     candidates->at(score) = i;
211 
212     MetaSlot& drop = slots[shortcut[i]];
213     drop.mark--;
214     if (drop.mark == 0) {
215       score -= drop.score;
216     }
217   }
218 
219   for (size_t i = 0; i < n; ++i) {
220     slots[i].mark = 0;
221   }
222 
223   candidates->resize(maxScore + 1);
224   return maxScore;
225 }
226 
addRange(std::vector<Range> * ranges,TextIdx start,TextIdx end)227 static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) {
228   for (auto it = ranges->begin(); it != ranges->end();) {
229     if (end < it->start) {
230       ranges->insert(it, {start, end});
231       return;
232     }
233     if (it->end < start) {
234       it++;
235       continue;
236     }
237     // Combine with existing.
238     start = std::min(start, it->start);
239     end = std::max(end, it->end);
240     // Remove consumed vector and continue.
241     it = ranges->erase(it);
242   }
243   ranges->push_back({start, end});
244 }
245 
durchschlag_generate(size_t dictionary_size_limit,size_t slice_len,size_t block_len,const std::vector<size_t> & sample_sizes,const uint8_t * sample_data)246 std::string durchschlag_generate(
247     size_t dictionary_size_limit, size_t slice_len, size_t block_len,
248     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
249   DurchschlagContext ctx = durchschlag_prepare(
250       slice_len, sample_sizes, sample_data);
251   return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE,
252       dictionary_size_limit, block_len, ctx, sample_data);
253 }
254 
durchschlag_prepare(size_t slice_len,const std::vector<size_t> & sample_sizes,const uint8_t * sample_data)255 DurchschlagContext durchschlag_prepare(size_t slice_len,
256     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
257   /* Parameters aliasing */
258   TextIdx sliceLen = static_cast<TextIdx>(slice_len);
259   if (sliceLen != slice_len) fatal("slice_len is too large");
260   if (sliceLen < 1) fatal("slice_len is too small");
261   const uint8_t* data = sample_data;
262 
263   TextIdx total = 0;
264   std::vector<TextIdx> offsets;
265   offsets.reserve(sample_sizes.size());
266   for (size_t i = 0; i < sample_sizes.size(); ++i) {
267     TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
268     if (delta != sample_sizes[i]) fatal("sample is too large");
269     if (delta == 0) fatal("0-length samples are prohibited");
270     TextIdx next_total = total + delta;
271     if (next_total <= total) fatal("corpus is too large");
272     total = next_total;
273     offsets.push_back(total);
274   }
275 
276   if (total < sliceLen) fatal("slice_len is larger than corpus size");
277   TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1;
278   TextIdx hashLen = 11;
279   while (hashLen < 29 && ((1u << hashLen) < end)) {
280     hashLen += 3;
281   }
282   hashLen -= 3;
283   TextIdx hashMask = (1u << hashLen) - 1u;
284   std::vector<TextIdx> hashHead(1 << hashLen);
285   TextIdx hash = 0;
286   TextIdx lShift = 3;
287   TextIdx rShift = hashLen - lShift;
288   for (TextIdx i = 0; i < sliceLen - 1; ++i) {
289     TextIdx v = data[i];
290     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
291   }
292   TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
293   TextIdx rShiftX = hashLen - lShiftX;
294 
295   std::vector<HashSlot> map;
296   map.push_back({0, 0});
297   TextIdx hashSlot = 1;
298   std::vector<TextIdx> sliceMap;
299   sliceMap.reserve(end);
300   for (TextIdx i = 0; i < end; ++i) {
301     TextIdx v = data[i + sliceLen - 1];
302     TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
303     v = data[i];
304     hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask);
305     TextIdx slot = hashHead[bucket];
306     while (slot != 0) {
307       HashSlot& item = map[slot];
308       TextIdx start = item.offset;
309       bool miss = false;
310       for (TextIdx j = 0; j < sliceLen; ++j) {
311         if (data[i + j] != data[start + j]) {
312           miss = true;
313           break;
314         }
315       }
316       if (!miss) {
317         sliceMap.push_back(slot);
318         break;
319       }
320       slot = item.next;
321     }
322     if (slot == 0) {
323       map.push_back({hashHead[bucket], i});
324       hashHead[bucket] = hashSlot;
325       sliceMap.push_back(hashSlot);
326       hashSlot++;
327     }
328   }
329 
330   return {total, sliceLen, static_cast<TextIdx>(map.size()),
331       std::move(offsets), std::move(sliceMap)};
332 }
333 
durchschlag_prepare(size_t slice_len,const std::vector<size_t> & sample_sizes,const DurchschlagIndex & index)334 DurchschlagContext durchschlag_prepare(size_t slice_len,
335     const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) {
336   /* Parameters aliasing */
337   TextIdx sliceLen = static_cast<TextIdx>(slice_len);
338   if (sliceLen != slice_len) fatal("slice_len is too large");
339   if (sliceLen < 1) fatal("slice_len is too small");
340   const TextIdx* lcp = index.lcp.data();
341   const TextIdx* sa = index.sa.data();
342 
343   TextIdx total = 0;
344   std::vector<TextIdx> offsets;
345   offsets.reserve(sample_sizes.size());
346   for (size_t i = 0; i < sample_sizes.size(); ++i) {
347     TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
348     if (delta != sample_sizes[i]) fatal("sample is too large");
349     if (delta == 0) fatal("0-length samples are prohibited");
350     TextIdx next_total = total + delta;
351     if (next_total <= total) fatal("corpus is too large");
352     total = next_total;
353     offsets.push_back(total);
354   }
355 
356   if (total < sliceLen) fatal("slice_len is larger than corpus size");
357   TextIdx counter = 1;
358   TextIdx end = total - sliceLen + 1;
359   std::vector<TextIdx> sliceMap(total);
360   TextIdx last = 0;
361   TextIdx current = 1;
362   while (current <= total) {
363     if (lcp[current - 1] < sliceLen) {
364       for (TextIdx i = last; i < current; ++i) {
365         sliceMap[sa[i]] = counter;
366       }
367       counter++;
368       last = current;
369     }
370     current++;
371   }
372   sliceMap.resize(end);
373 
374   // Reorder items for the better locality.
375   std::vector<TextIdx> reorder(counter);
376   counter = 1;
377   for (TextIdx i = 0; i < end; ++i) {
378     if (reorder[sliceMap[i]] == 0) {
379       reorder[sliceMap[i]] = counter++;
380     }
381   }
382   for (TextIdx i = 0; i < end; ++i) {
383     sliceMap[i] = reorder[sliceMap[i]];
384   }
385 
386   return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)};
387 }
388 
durchschlag_index(const std::vector<uint8_t> & data)389 DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) {
390   TextIdx total = static_cast<TextIdx>(data.size());
391   if (total != data.size()) fatal("corpus is too large");
392   saidx_t saTotal = static_cast<saidx_t>(total);
393   if (saTotal < 0) fatal("corpus is too large");
394   if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large");
395   std::vector<TextIdx> sa(total);
396   /* Hopefully, non-negative int32_t values match TextIdx ones. */
397   if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch");
398   int32_t* saData = reinterpret_cast<int32_t*>(sa.data());
399   divsufsort(data.data(), saData, saTotal);
400 
401   std::vector<TextIdx> isa(total);
402   for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i;
403 
404   // TODO: borrowed -> unknown efficiency.
405   std::vector<TextIdx> lcp(total);
406   TextIdx k = 0;
407   lcp[total - 1] = 0;
408   for (TextIdx i = 0; i < total; ++i) {
409     TextIdx current = isa[i];
410     if (current == total - 1) {
411       k = 0;
412       continue;
413     }
414     TextIdx j = sa[current + 1];  // Suffix which follow i-th suffix.
415     while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) {
416       ++k;
417     }
418     lcp[current] = k;
419     if (k > 0) --k;
420   }
421 
422   return {std::move(lcp), std::move(sa)};
423 }
424 
ScoreSlices(const std::vector<TextIdx> & offsets,std::vector<MetaSlot> & map,const TextIdx * shortcut,TextIdx end)425 static void ScoreSlices(const std::vector<TextIdx>& offsets,
426     std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) {
427   TextIdx piece = 0;
428   /* Fresh map contains all zeroes -> initial mark should be different. */
429   TextIdx mark = 1;
430   for (TextIdx i = 0; i < end; ++i) {
431     if (offsets[piece] == i) {
432       piece++;
433       mark++;
434     }
435     MetaSlot& item = map[shortcut[i]];
436     if (item.mark != mark) {
437       item.mark = mark;
438       item.score++;
439     }
440   }
441 }
442 
/* Generates a dictionary using per-score linked lists of all block positions.

   Repeatedly takes the position with the highest recorded score, re-evaluates
   its block against the current slice scores and, if the score still holds,
   adds the block to the dictionary and zeroes the scores of the slices it
   covers. Stops when another block would not fit into dictionary_size_limit
   or no positively scored position is left.

   Returns the dictionary bytes, or an empty string (after printing a message
   to stderr) when the parameters are rejected or an internal invariant is
   violated. */
static std::string durchschlagGenerateExclusive(
    size_t dictionary_size_limit, size_t block_len,
    const DurchschlagContext& context, const uint8_t* sample_data) {
  /* Parameters aliasing */
  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
  if (targetSize != dictionary_size_limit) {
    fprintf(stderr, "dictionary_size_limit is too large\n");
    return "";
  }
  TextIdx sliceLen = context.sliceLen;
  TextIdx total = context.dataSize;
  TextIdx blockLen = static_cast<TextIdx>(block_len);
  if (blockLen != block_len) {
    fprintf(stderr, "block_len is too large\n");
    return "";
  }
  const uint8_t* data = sample_data;
  const std::vector<TextIdx>& offsets = context.offsets;
  std::vector<MetaSlot> map(context.numUniqueSlices);
  const TextIdx* shortcut = context.sliceMap.data();

  /* Initialization */
  if (blockLen < sliceLen) {
    fprintf(stderr, "sliceLen is larger than block_len\n");
    return "";
  }
  if (targetSize < blockLen || total < blockLen) {
    fprintf(stderr, "block_len is too large\n");
    return "";
  }
  TextIdx end = total - sliceLen + 1;
  ScoreSlices(offsets, map, shortcut, end);
  /* Number of slices covered by one block. */
  TextIdx span = blockLen - sliceLen + 1;
  /* From here on `end` bounds the block start positions that are evaluated. */
  end = static_cast<TextIdx>(context.sliceMap.size()) - span;
  /* candidates[s] is the head of the list of positions recorded with score s;
     next[p] is the successor of position p in its list; 0 ends a list. */
  std::vector<TextIdx> candidates;
  std::vector<TextIdx> next(end);
  Score maxScore = rebuildCandidatesList(
      &candidates, &map, span, shortcut, end, next.data());

  /* Block selection */
  /* Budgets of stale evaluations tolerated before the lists are rebuilt. */
  const size_t triesLimit = (600 * 1000000) / span;
  const size_t candidatesLimit = (150 * 1000000) / span;
  std::vector<Range> ranges;
  /* Id of the current evaluation; lets a slice repeated inside one block be
     valued once without clearing all marks between evaluations. */
  TextIdx mark = 0;
  size_t numTries = 0;
  while (true) {
    TextIdx dictSize = calculateDictionarySize(ranges);
    size_t numCandidates = 0;
    /* Stop when one more block might not fit. */
    if (dictSize > targetSize - blockLen) {
      break;
    }
    if (maxScore == 0) {
      break;
    }
    while (true) {
      /* Pop the head of the highest non-empty score list. */
      TextIdx candidate = 0;
      while (maxScore > 0) {
        if (candidates[maxScore] != 0) {
          candidate = candidates[maxScore];
          candidates[maxScore] = next[candidate];
          break;
        }
        maxScore--;
      }
      if (maxScore == 0) {
        break;
      }
      mark++;
      numTries++;
      numCandidates++;
      /* Re-evaluate the block with the current slice scores. */
      Score score = 0;
      for (size_t j = candidate; j < candidate + span; ++j) {
        MetaSlot& item = map[shortcut[j]];
        if (item.mark != mark) {
          score += item.score;
          item.mark = mark;
        }
      }
      if (score < maxScore) {
        /* The recorded score is stale: either file the position under its
           current score, or, once a budget is exhausted, rebuild all lists. */
        if (numTries < triesLimit && numCandidates < candidatesLimit) {
          next[candidate] = candidates[score];
          candidates[score] = candidate;
        } else {
          maxScore = rebuildCandidatesList(
              &candidates, &map, span, shortcut, end, next.data());
          mark = 0;
          numTries = 0;
          numCandidates = 0;
        }
        continue;
      } else if (score > maxScore) {
        /* Slice scores are only ever zeroed, so a block's score cannot grow. */
        fprintf(stderr, "Broken invariant\n");
        return "";
      }
      /* Accept the block: its slices are now covered and worth nothing. */
      for (TextIdx j = candidate; j < candidate + span; ++j) {
        MetaSlot& item = map[shortcut[j]];
        item.score = 0;
      }
      addRange(&ranges, candidate, candidate + blockLen);
      break;
    }
  }

  return createDictionary(data, ranges, targetSize);
}
548 
durchschlagGenerateCollaborative(size_t dictionary_size_limit,size_t block_len,const DurchschlagContext & context,const uint8_t * sample_data)549 static std::string durchschlagGenerateCollaborative(
550     size_t dictionary_size_limit, size_t block_len,
551     const DurchschlagContext& context, const uint8_t* sample_data) {
552   /* Parameters aliasing */
553   TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
554   if (targetSize != dictionary_size_limit) {
555     fprintf(stderr, "dictionary_size_limit is too large\n");
556     return "";
557   }
558   TextIdx sliceLen = context.sliceLen;
559   TextIdx total = context.dataSize;
560   TextIdx blockLen = static_cast<TextIdx>(block_len);
561   if (blockLen != block_len) {
562     fprintf(stderr, "block_len is too large\n");
563     return "";
564   }
565   const uint8_t* data = sample_data;
566   const std::vector<TextIdx>& offsets = context.offsets;
567   std::vector<MetaSlot> map(context.numUniqueSlices);
568   const TextIdx* shortcut = context.sliceMap.data();
569 
570   /* Initialization */
571   if (blockLen < sliceLen) {
572     fprintf(stderr, "sliceLen is larger than block_len\n");
573     return "";
574   }
575   if (targetSize < blockLen || total < blockLen) {
576     fprintf(stderr, "block_len is too large\n");
577     return "";
578   }
579   TextIdx end = total - sliceLen + 1;
580   ScoreSlices(offsets, map, shortcut, end);
581   TextIdx span = blockLen - sliceLen + 1;
582   end = static_cast<TextIdx>(context.sliceMap.size()) - span;
583   std::vector<Candidate> candidates;
584   candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024);
585   Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
586 
587   /* Block selection */
588   std::vector<Range> ranges;
589   TextIdx mark = 0;
590   while (true) {
591     TextIdx dictSize = calculateDictionarySize(ranges);
592     if (dictSize > targetSize - blockLen) {
593       break;
594     }
595     if (minScore == 0 && candidates.empty()) {
596       break;
597     }
598     while (true) {
599       if (candidates.empty()) {
600         minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
601         mark = 0;
602       }
603       TextIdx candidate = candidates[0].position;
604       Score expectedScore = candidates[0].score;
605       if (expectedScore == 0) {
606         candidates.resize(0);
607         break;
608       }
609       std::pop_heap(candidates.begin(), candidates.end(), lessScore());
610       candidates.pop_back();
611       mark++;
612       Score score = 0;
613       for (TextIdx j = candidate; j < candidate + span; ++j) {
614         MetaSlot& item = map[shortcut[j]];
615         if (item.mark != mark) {
616           score += item.score;
617           item.mark = mark;
618         }
619       }
620       if (score < expectedScore) {
621         if (score >= minScore) {
622           candidates.push_back({score, candidate});
623           std::push_heap(candidates.begin(), candidates.end(), lessScore());
624         }
625         continue;
626       } else if (score > expectedScore) {
627         fatal("Broken invariant");
628       }
629       for (TextIdx j = candidate; j < candidate + span; ++j) {
630         MetaSlot& item = map[shortcut[j]];
631         item.score = 0;
632       }
633       addRange(&ranges, candidate, candidate + blockLen);
634       break;
635     }
636   }
637 
638   return createDictionary(data, ranges, targetSize);
639 }
640 
durchschlag_generate(DurchschalgResourceStrategy strategy,size_t dictionary_size_limit,size_t block_len,const DurchschlagContext & context,const uint8_t * sample_data)641 std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
642     size_t dictionary_size_limit, size_t block_len,
643     const DurchschlagContext& context, const uint8_t* sample_data) {
644   if (strategy == DURCHSCHLAG_COLLABORATIVE) {
645     return durchschlagGenerateCollaborative(
646         dictionary_size_limit, block_len, context, sample_data);
647   } else {
648     return durchschlagGenerateExclusive(
649         dictionary_size_limit, block_len, context, sample_data);
650   }
651 }
652 
durchschlag_distill(size_t slice_len,size_t minimum_population,std::vector<size_t> * sample_sizes,uint8_t * sample_data)653 void durchschlag_distill(size_t slice_len, size_t minimum_population,
654     std::vector<size_t>* sample_sizes, uint8_t* sample_data) {
655   /* Parameters aliasing */
656   uint8_t* data = sample_data;
657 
658   /* Build slice map. */
659   DurchschlagContext context = durchschlag_prepare(
660       slice_len, *sample_sizes, data);
661 
662   /* Calculate slice population. */
663   const std::vector<TextIdx>& offsets = context.offsets;
664   std::vector<MetaSlot> map(context.numUniqueSlices);
665   const TextIdx* shortcut = context.sliceMap.data();
666   TextIdx sliceLen = context.sliceLen;
667   TextIdx total = context.dataSize;
668   TextIdx end = total - sliceLen + 1;
669   ScoreSlices(offsets, map, shortcut, end);
670 
671   /* Condense samples, omitting unique slices. */
672   TextIdx readPos = 0;
673   TextIdx writePos = 0;
674   TextIdx lastNonUniquePos = 0;
675   for (TextIdx i = 0; i < sample_sizes->size(); ++i) {
676     TextIdx sampleStart = writePos;
677     TextIdx oldSampleEnd =
678         readPos + static_cast<TextIdx>(sample_sizes->at(i));
679     while (readPos < oldSampleEnd) {
680       if (readPos < end) {
681         MetaSlot& item = map[shortcut[readPos]];
682         if (item.score >= minimum_population) {
683           lastNonUniquePos = readPos + sliceLen;
684         }
685       }
686       if (readPos < lastNonUniquePos) {
687         data[writePos++] = data[readPos];
688       }
689       readPos++;
690     }
691     sample_sizes->at(i) = writePos - sampleStart;
692   }
693 }
694 
durchschlag_purify(size_t slice_len,size_t minimum_population,const std::vector<size_t> & sample_sizes,uint8_t * sample_data)695 void durchschlag_purify(size_t slice_len, size_t minimum_population,
696     const std::vector<size_t>& sample_sizes, uint8_t* sample_data) {
697   /* Parameters aliasing */
698   uint8_t* data = sample_data;
699 
700   /* Build slice map. */
701   DurchschlagContext context = durchschlag_prepare(
702       slice_len, sample_sizes, data);
703 
704   /* Calculate slice population. */
705   const std::vector<TextIdx>& offsets = context.offsets;
706   std::vector<MetaSlot> map(context.numUniqueSlices);
707   const TextIdx* shortcut = context.sliceMap.data();
708   TextIdx sliceLen = context.sliceLen;
709   TextIdx total = context.dataSize;
710   TextIdx end = total - sliceLen + 1;
711   ScoreSlices(offsets, map, shortcut, end);
712 
713   /* Rewrite samples, zeroing out unique slices. */
714   TextIdx lastNonUniquePos = 0;
715   for (TextIdx readPos = 0; readPos < total; ++readPos) {
716     if (readPos < end) {
717       MetaSlot& item = map[shortcut[readPos]];
718       if (item.score >= minimum_population) {
719         lastNonUniquePos = readPos + sliceLen;
720       }
721     }
722     if (readPos >= lastNonUniquePos) {
723       data[readPos] = 0;
724     }
725   }
726 }
727