1 #include <climits>
2 #include <cstddef>
3 #include <cstdio>
4 #include <cstring>
5 #include <fstream>
6 #include <vector>
7 
8 #include "./deorummolae.h"
9 #include "./durchschlag.h"
10 #include "./sieve.h"
11 
/* This isn't a definitive list of "--foo" arguments, only those that take an
 * additional "=#" integer parameter, like "--foo=20" or "--foo=32K".
 *
 * main() recognizes each of them by prefix and then treats the text after
 * the '=' exactly like the value of the matching short option
 * (-b, -s, -t, -u, -c and -o, in the order listed below).
 */
#define LONG_ARG_BLOCK_LEN "--block_len="
#define LONG_ARG_SLICE_LEN "--slice_len="
#define LONG_ARG_TARGET_DICT_LEN "--target_dict_len="
#define LONG_ARG_MIN_SLICE_POP "--min_slice_pop="
#define LONG_ARG_CHUNK_LEN "--chunk_len="
#define LONG_ARG_OVERLAP_LEN "--overlap_len="

/* Values of the `method` selector in main(). They are chosen with the
 * "--dm", "--sieve", "--dsh", "--distill" and "--purify" flags respectively;
 * METHOD_DURCHSCHLAG is the value main() starts with when no flag is given.
 */
#define METHOD_DM 0
#define METHOD_SIEVE 1
#define METHOD_DURCHSCHLAG 2
#define METHOD_DISTILL 3
#define METHOD_PURIFY 4
27 
/* Parses a positive decimal number with an optional single k/K (x1024) or
 * m/M (x1048576) suffix.
 *
 * Returns the parsed value, or 0 when the text is empty, starts with '0',
 * contains an unexpected character, has anything after the suffix, overflows
 * the detection checks below, or is not terminated within the first 13
 * characters. Callers therefore cannot tell "0" apart from a parse error.
 */
static size_t readInt(const char* str) {
  static const size_t kMaxChars = 13;

  const char first = str[0];
  if (first == '\0' || first == '0') {
    return 0;
  }

  size_t value = 0;
  for (size_t pos = 0; pos < kMaxChars; ++pos) {
    const char c = str[pos];

    /* A non-zero shift means `c` is a size suffix. */
    unsigned shift = 0;
    switch (c) {
      case '\0':
        return value;
      case 'k':
      case 'K':
        shift = 10;
        break;
      case 'm':
      case 'M':
        shift = 20;
        break;
      default:
        break;
    }

    if (shift != 0) {
      /* The suffix must be the last character, and scaling must actually
       * grow the value (this also rejects a bare suffix with no digits). */
      const bool isLast = (str[pos + 1] == '\0');
      const size_t scaled = value << shift;
      return (isLast && scaled > value) ? scaled : 0;
    }

    if (c < '0' || c > '9') {
      return 0;
    }

    const size_t grown = (10 * value) + (c - '0');
    if (grown <= value) {
      return 0;
    }
    value = grown;
  }

  /* No terminator within kMaxChars characters: the number is too long. */
  return 0;
}
60 
/* Reads the whole file at `path` and returns its bytes as a string.
 *
 * The stream is opened in binary mode so that the bytes (and therefore the
 * sample sizes) are exactly what is stored on disk; the writers in this file
 * also use binary mode, so rewritten samples round-trip unchanged. A file
 * that cannot be opened yields an empty string.
 */
static std::string readFile(const std::string& path) {
  std::ifstream file(path, std::ifstream::binary);
  std::string content(
      (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
  return content;
}
67 
/* Stores `content` byte for byte in the file named `file`, opened for
 * writing in binary mode. Stream errors are not reported to the caller.
 */
static void writeFile(const char* file, const std::string& content) {
  const std::streamsize byteCount =
      static_cast<std::streamsize>(content.size());
  std::ofstream sink(file, std::ios_base::out | std::ios_base::binary);
  sink.write(content.data(), byteCount);
  sink.close();
}
73 
/* Writes consecutive slices of `data` back to the sample files.
 *
 * For every index k, the file named argv[pathArgs[k]] is opened in binary
 * mode and receives the next sizes[k] bytes of `data`; slices are laid out
 * back to back in the same order as `pathArgs`. `sizes` is indexed with the
 * same k as `pathArgs`. Stream errors are not reported.
 */
static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
    const std::vector<size_t>& sizes, const uint8_t* data) {
  const uint8_t* cursor = data;
  const size_t sampleCount = pathArgs.size();
  for (size_t sample = 0; sample != sampleCount; ++sample) {
    const size_t length = sizes[sample];
    const char* target = argv[pathArgs[sample]];

    std::ofstream sink(target, std::ios_base::out | std::ios_base::binary);
    sink.write(reinterpret_cast<const char*>(cursor),
        static_cast<std::streamsize>(length));
    sink.close();

    cursor += length;
  }
}
88 
/* Strips directory components from `path`: first everything up to the last
 * '/', then, within what remains, everything up to the last '\'. The result
 * points into `path` itself; a path without separators is returned as is.
 */
static const char* fileName(const char* path) {
  const char kSeparators[] = {'/', '\\'};
  const char* tail = path;
  for (char separator : kSeparators) {
    const char* hit = strrchr(tail, separator);
    if (hit != nullptr) {
      tail = hit + 1;
    }
  }
  return tail;
}
97 
/* Prints the usage summary to stderr: a "Usage:" line that names the program
 * as `name`, followed by the fixed description of all options.
 */
static void printHelp(const char* name) {
  /* Fixed part of the help text; it contains no format specifiers. */
  static const char kOptionsText[] =
      "Options:\n"
      "  --dm       use 'deorummolae' engine\n"
      "  --distill  rewrite samples; unique text parts are removed\n"
      "  --dsh      use 'durchschlag' engine (default)\n"
      "  --purify   rewrite samples; unique text parts are zeroed out\n"
      "  --sieve    use 'sieve' engine\n"
      "  -b#, --block_len=#\n"
      "             set block length for 'durchschlag'; default: 1024\n"
      "  -s#, --slice_len=#\n"
      "             set slice length for 'distill', 'durchschlag', 'purify'\n"
      "             and 'sieve'; default: 16\n"
      "  -t#, --target_dict_len=#\n"
      "             set target dictionary length (limit); default: 16K\n"
      "  -u#, --min_slice_pop=#\n"
      "             set minimum slice population (for rewrites); default: 2\n"
      "  -c#, --chunk_len=#\n"
      "             if positive, samples are cut into chunks of this length;\n"
      "             default: 0; cannot mix with 'rewrite samples'\n"
      "  -o#, --overlap_len=#\n"
      "             set chunk overlap length; default 0\n"
      "# is a decimal number with optional k/K/m/M suffix.\n"
      "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
      "         Completely unique samples might become empty files.\n\n";

  fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
  fputs(kOptionsText, stderr);
}
125 
main(int argc,char const * argv[])126 int main(int argc, char const* argv[]) {
127   int dictionaryArg = -1;
128   int method = METHOD_DURCHSCHLAG;
129   size_t sliceLen = 16;
130   size_t targetSize = 16 << 10;
131   size_t blockSize = 1024;
132   size_t minimumPopulation = 2;
133   size_t chunkLen = 0;
134   size_t overlapLen = 0;
135 
136   std::vector<uint8_t> data;
137   std::vector<size_t> sizes;
138   std::vector<int> pathArgs;
139   size_t total = 0;
140   for (int i = 1; i < argc; ++i) {
141     if (argv[i] == nullptr) {
142       continue;
143     }
144 
145     if (argv[i][0] == '-') {
146       char arg1 = argv[i][1];
147       const char* arg2 = arg1 ? &argv[i][2] : nullptr;
148       if (arg1 == '-') {
149         if (dictionaryArg != -1) {
150           fprintf(stderr,
151               "Method should be specified before dictionary / sample '%s'\n",
152               argv[i]);
153           exit(1);
154         }
155 
156         /* Look for "--long_arg" via exact match. */
157         if (std::strcmp(argv[i], "--sieve") == 0) {
158           method = METHOD_SIEVE;
159           continue;
160         }
161         if (std::strcmp(argv[i], "--dm") == 0) {
162           method = METHOD_DM;
163           continue;
164         }
165         if (std::strcmp(argv[i], "--dsh") == 0) {
166           method = METHOD_DURCHSCHLAG;
167           continue;
168         }
169         if (std::strcmp(argv[i], "--distill") == 0) {
170           method = METHOD_DISTILL;
171           continue;
172         }
173         if (std::strcmp(argv[i], "--purify") == 0) {
174           method = METHOD_PURIFY;
175           continue;
176         }
177 
178         /* Look for "--long_arg=#" via prefix match. */
179         if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN,
180               std::strlen(LONG_ARG_BLOCK_LEN)) == 0) {
181           arg1 = 'b';
182           arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)];
183         } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN,
184               std::strlen(LONG_ARG_SLICE_LEN)) == 0) {
185           arg1 = 's';
186           arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)];
187         } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN,
188               std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) {
189           arg1 = 't';
190           arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)];
191         } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP,
192               std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) {
193           arg1 = 'u';
194           arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)];
195         } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN,
196               std::strlen(LONG_ARG_CHUNK_LEN)) == 0) {
197           arg1 = 'c';
198           arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)];
199         } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN,
200               std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) {
201           arg1 = 'o';
202           arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)];
203         } else {
204           printHelp(fileName(argv[0]));
205           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
206           exit(1);
207         }
208       }
209 
210       /* Look for "-f" short args or "--foo=#" long args. */
211       if (arg1 == 'b') {
212         blockSize = readInt(arg2);
213         if (blockSize < 16 || blockSize > 65536) {
214           printHelp(fileName(argv[0]));
215           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
216           exit(1);
217         }
218       } else if (arg1 == 's') {
219         sliceLen = readInt(arg2);
220         if (sliceLen < 4 || sliceLen > 256) {
221           printHelp(fileName(argv[0]));
222           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
223           exit(1);
224         }
225       } else if (arg1 == 't') {
226         targetSize = readInt(arg2);
227         if (targetSize < 256 || targetSize > (1 << 25)) {
228           printHelp(fileName(argv[0]));
229           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
230           exit(1);
231         }
232       } else if (arg1 == 'u') {
233         minimumPopulation = readInt(arg2);
234         if (minimumPopulation < 256 || minimumPopulation > 65536) {
235           printHelp(fileName(argv[0]));
236           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
237           exit(1);
238         }
239       } else if (arg1 == 'c') {
240         chunkLen = readInt(arg2);
241         if (chunkLen < 0 || chunkLen > INT_MAX) {
242           printHelp(fileName(argv[0]));
243           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
244           exit(1);
245         }
246       } else if (arg1 == 'o') {
247         overlapLen = readInt(arg2);
248         if (overlapLen < 0 || overlapLen > INT_MAX) {
249           printHelp(fileName(argv[0]));
250           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
251           exit(1);
252         }
253       } else {
254         printHelp(fileName(argv[0]));
255         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
256         exit(1);
257       }
258       continue;
259     }
260 
261     if (dictionaryArg == -1) {
262       if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
263         dictionaryArg = i;
264         continue;
265       }
266     }
267 
268     std::string content = readFile(argv[i]);
269     if (chunkLen == 0) {
270       pathArgs.push_back(i);
271       data.insert(data.end(), content.begin(), content.end());
272       total += content.size();
273       sizes.push_back(content.size());
274       continue;
275     } else if (chunkLen <= overlapLen) {
276       printHelp(fileName(argv[0]));
277       fprintf(stderr, "Invalid chunkLen - overlapLen combination\n");
278       exit(1);
279     }
280     for (size_t chunkStart = 0;
281         chunkStart < content.size();
282         chunkStart += chunkLen - overlapLen) {
283       std::string chunk = content.substr(chunkStart, chunkLen);
284       data.insert(data.end(), chunk.begin(), chunk.end());
285       total += chunk.size();
286       sizes.push_back(chunk.size());
287     }
288   }
289 
290   bool wantDictionary = (dictionaryArg == -1);
291   if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
292     wantDictionary = false;
293     if (chunkLen != 0) {
294       printHelp(fileName(argv[0]));
295       fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n");
296       exit(1);
297     }
298   }
299   if (wantDictionary || total == 0) {
300     printHelp(fileName(argv[0]));
301     fprintf(stderr, "Not enough arguments\n");
302     exit(1);
303   }
304 
305   if (method == METHOD_SIEVE) {
306     writeFile(argv[dictionaryArg], sieve_generate(
307         targetSize, sliceLen, sizes, data.data()));
308   } else if (method == METHOD_DM) {
309     writeFile(argv[dictionaryArg], DM_generate(
310         targetSize, sizes, data.data()));
311   } else if (method == METHOD_DURCHSCHLAG) {
312     writeFile(argv[dictionaryArg], durchschlag_generate(
313         targetSize, sliceLen, blockSize, sizes, data.data()));
314   } else if (method == METHOD_DISTILL) {
315     durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
316     writeSamples(argv, pathArgs, sizes, data.data());
317   } else if (method == METHOD_PURIFY) {
318     durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
319     writeSamples(argv, pathArgs, sizes, data.data());
320   } else {
321     printHelp(fileName(argv[0]));
322     fprintf(stderr, "Unknown generator\n");
323     exit(1);
324   }
325   return 0;
326 }
327