1 /*
2  * Copyright (c) 2017-2020, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 #include <stdio.h>  /* fprintf */
12 #include <stdlib.h> /* malloc, free, qsort */
13 #include <string.h> /* memset */
14 #include <time.h>   /* clock */
15 #include "../common/mem.h" /* read */
16 #include "../common/pool.h"
17 #include "../common/threading.h"
18 #include "../common/zstd_internal.h" /* includes zstd.h */
19 #ifndef ZDICT_STATIC_LINKING_ONLY
20 #define ZDICT_STATIC_LINKING_ONLY
21 #endif
22 #include "zdict.h"
23 
24 /**
25  * COVER_best_t is used for two purposes:
26  * 1. Synchronizing threads.
27  * 2. Saving the best parameters and dictionary.
28  *
29  * All of the methods except COVER_best_init() are thread safe if zstd is
30  * compiled with multithreaded support.
31  */
32 typedef struct COVER_best_s {
33   ZSTD_pthread_mutex_t mutex;
34   ZSTD_pthread_cond_t cond;
35   size_t liveJobs;
36   void *dict;
37   size_t dictSize;
38   ZDICT_cover_params_t parameters;
39   size_t compressedSize;
40 } COVER_best_t;
41 
42 /**
43  * A segment is a range in the source as well as the score of the segment.
44  */
45 typedef struct {
46   U32 begin;
47   U32 end;
48   U32 score;
49 } COVER_segment_t;
50 
51 /**
52  *Number of epochs and size of each epoch.
53  */
54 typedef struct {
55   U32 num;
56   U32 size;
57 } COVER_epoch_info_t;
58 
59 /**
60  * Struct used for the dictionary selection function.
61  */
62 typedef struct COVER_dictSelection {
63   BYTE* dictContent;
64   size_t dictSize;
65   size_t totalCompressedSize;
66 } COVER_dictSelection_t;
67 
68 /**
69  * Computes the number of epochs and the size of each epoch.
70  * We will make sure that each epoch gets at least 10 * k bytes.
71  *
72  * The COVER algorithms divide the data up into epochs of equal size and
73  * select one segment from each epoch.
74  *
75  * @param maxDictSize The maximum allowed dictionary size.
76  * @param nbDmers     The number of dmers we are training on.
77  * @param k           The parameter k (segment size).
78  * @param passes      The target number of passes over the dmer corpus.
79  *                    More passes means a better dictionary.
80  */
81 COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
82                                        U32 k, U32 passes);
83 
84 /**
85  * Warns the user when their corpus is too small.
86  */
87 void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
88 
89 /**
90  *  Checks total compressed size of a dictionary
91  */
92 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
93                                       const size_t *samplesSizes, const BYTE *samples,
94                                       size_t *offsets,
95                                       size_t nbTrainSamples, size_t nbSamples,
96                                       BYTE *const dict, size_t dictBufferCapacity);
97 
98 /**
99  * Returns the sum of the sample sizes.
100  */
101 size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
102 
103 /**
104  * Initialize the `COVER_best_t`.
105  */
106 void COVER_best_init(COVER_best_t *best);
107 
108 /**
109  * Wait until liveJobs == 0.
110  */
111 void COVER_best_wait(COVER_best_t *best);
112 
113 /**
114  * Call COVER_best_wait() and then destroy the COVER_best_t.
115  */
116 void COVER_best_destroy(COVER_best_t *best);
117 
118 /**
119  * Called when a thread is about to be launched.
120  * Increments liveJobs.
121  */
122 void COVER_best_start(COVER_best_t *best);
123 
124 /**
125  * Called when a thread finishes executing, both on error or success.
126  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
127  * If this dictionary is the best so far save it and its parameters.
128  */
129 void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
130                        COVER_dictSelection_t selection);
131 /**
132  * Error function for COVER_selectDict function. Checks if the return
133  * value is an error.
134  */
135 unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
136 
137  /**
138   * Error function for COVER_selectDict function. Returns a struct where
139   * return.totalCompressedSize is a ZSTD error.
140   */
141 COVER_dictSelection_t COVER_dictSelectionError(size_t error);
142 
143 /**
144  * Always call after selectDict is called to free up used memory from
145  * newly created dictionary.
146  */
147 void COVER_dictSelectionFree(COVER_dictSelection_t selection);
148 
149 /**
150  * Called to finalize the dictionary and select one based on whether or not
151  * the shrink-dict flag was enabled. If enabled the dictionary used is the
152  * smallest dictionary within a specified regression of the compressed size
153  * from the largest dictionary.
154  */
155  COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
156                        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
157                        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
158