1 #ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_
2 #define BROTLI_RESEARCH_DURCHSCHLAG_H_
3 
4 #include <cstddef>
5 #include <cstdint>
6 #include <string>
7 #include <vector>
8 
9 /**
10  * Generate a dictionary for given samples.
11  *
12  * @param dictionary_size_limit maximal dictionary size
13  * @param slice_len text slice size
14  * @param block_len score block length
15  * @param sample_sizes vector with sample sizes
16  * @param sample_data concatenated samples
17  * @return generated dictionary
18  */
19 std::string durchschlag_generate(
20     size_t dictionary_size_limit, size_t slice_len, size_t block_len,
21     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
22 
23 //------------------------------------------------------------------------------
24 // Lower level API for repetitive dictionary generation.
25 //------------------------------------------------------------------------------
26 
27 /* Pointer to position in text. */
28 typedef uint32_t DurchschlagTextIdx;
29 
30 /* Context is made public for flexible serialization / deserialization. */
31 typedef struct DurchschlagContext {
32   DurchschlagTextIdx dataSize;
33   DurchschlagTextIdx sliceLen;
34   DurchschlagTextIdx numUniqueSlices;
35   std::vector<DurchschlagTextIdx> offsets;
36   std::vector<DurchschlagTextIdx> sliceMap;
37 } DurchschlagContext;
38 
39 DurchschlagContext durchschlag_prepare(size_t slice_len,
40     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
41 
42 typedef enum DurchschalgResourceStrategy {
43   // Faster
44   DURCHSCHLAG_EXCLUSIVE = 0,
45   // Uses much less memory
46   DURCHSCHLAG_COLLABORATIVE = 1
47 } DurchschalgResourceStrategy;
48 
49 std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
50     size_t dictionary_size_limit, size_t block_len,
51     const DurchschlagContext& context, const uint8_t* sample_data);
52 
53 //------------------------------------------------------------------------------
54 // Suffix Array based preparation.
55 //------------------------------------------------------------------------------
56 
57 typedef struct DurchschlagIndex {
58   std::vector<DurchschlagTextIdx> lcp;
59   std::vector<DurchschlagTextIdx> sa;
60 } DurchschlagIndex;
61 
62 DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data);
63 
64 DurchschlagContext durchschlag_prepare(size_t slice_len,
65     const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index);
66 
67 //------------------------------------------------------------------------------
68 // Data preparation.
69 //------------------------------------------------------------------------------
70 
71 /**
72  * Cut out unique slices.
73  *
74  * Both @p sample_sizes and @p sample_data are modified in-place. Number of
75  * samples remains unchanged, but some samples become shorter.
76  *
77  * @param slice_len (unique) slice size
78  * @param minimum_population minimum non-unique slice occurrence
79  * @param sample_sizes [in / out] vector with sample sizes
80  * @param sample_data [in / out] concatenated samples
81  */
82 void durchschlag_distill(size_t slice_len, size_t minimum_population,
83     std::vector<size_t>* sample_sizes, uint8_t* sample_data);
84 
85 /**
86  * Replace unique slices with zeroes.
87  *
88  * @p sample_data is modified in-place. Number of samples and their length
89  * remain unchanged.
90  *
91  * @param slice_len (unique) slice size
92  * @param minimum_population minimum non-unique slice occurrence
93  * @param sample_sizes vector with sample sizes
94  * @param sample_data [in / out] concatenated samples
95  */
96 void durchschlag_purify(size_t slice_len, size_t minimum_population,
97     const std::vector<size_t>& sample_sizes, uint8_t* sample_data);
98 
99 #endif  // BROTLI_RESEARCH_DURCHSCHLAG_H_
100