1 /* Copyright 2016 Google Inc. All Rights Reserved.
2    Author: zip753@gmail.com (Ivan Nikulin)
3 
4    Distributed under MIT license.
5    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6 */
7 
8 /* Tool for generating optimal backward references for the input file. Uses
9    sais-lite library for building suffix array. */
10 
11 #include <algorithm>
12 #include <cassert>
13 #include <cstdio>
14 #include <cstring>
15 #include <functional>
16 #include <utility>
17 #include <vector>
18 
19 #include <gflags/gflags.h>
20 using gflags::ParseCommandLineFlags;
21 
22 #include "./esaxx/sais.hxx"
23 
24 DEFINE_bool(advanced, false, "Advanced searching mode: finds all longest "
25     "matches at positions that are not covered by matches of length at least "
26     "max_length. WARNING: uses much more memory than simple mode, especially "
27     "for small values of min_length.");
28 DEFINE_int32(min_length, 1, "Minimal length of found backward references.");
29 /* For advanced mode. */
30 DEFINE_int32(long_length, 32,
31              "Maximal length of found backward references for advanced mode.");
32 DEFINE_int32(skip, 1, "Number of bytes to skip.");
33 
34 const size_t kFileBufferSize = (1 << 16);  // 64KB
35 
36 typedef int sarray_type;  // Can't make it unsigned because of templates :(
37 typedef uint8_t input_type;
38 typedef uint32_t lcp_type;
39 typedef std::pair<int, std::vector<int> > entry_type;
40 typedef std::function<void(sarray_type*, lcp_type*, size_t, int, int, int, int,
41                            int)> Fn;
42 
ReadInput(FILE * fin,input_type * storage,size_t input_size)43 void ReadInput(FILE* fin, input_type* storage, size_t input_size) {
44   size_t last_pos = 0;
45   size_t available_in;
46   fseek(fin, 0, SEEK_SET);
47   do {
48     available_in = fread(storage + last_pos, 1, kFileBufferSize, fin);
49     last_pos += available_in;
50   } while (available_in != 0);
51   assert(last_pos == input_size);
52 }
53 
BuildLCP(input_type * storage,sarray_type * sarray,lcp_type * lcp,size_t size,uint32_t * pos)54 void BuildLCP(input_type* storage, sarray_type* sarray, lcp_type* lcp,
55               size_t size, uint32_t* pos) {
56   for (int i = 0; i < size; ++i) {
57     pos[sarray[i]] = i;
58   }
59   uint32_t k = 0;
60   lcp[size - 1] = 0;
61   for (int i = 0; i < size; ++i) {
62     if (pos[i] == size - 1) {
63       k = 0;
64       continue;
65     }
66     uint32_t j = sarray[pos[i] + 1];  // Suffix which follow i-th suffix in SA.
67     while (i + k < size && j + k < size && storage[i + k] == storage[j + k]) {
68       ++k;
69     }
70     lcp[pos[i]] = k;
71     if (k > 0) --k;
72   }
73 }
74 
PrintReference(sarray_type * sarray,lcp_type * lcp,size_t size,int idx,int left_ix,int right_ix,int left_lcp,int right_lcp,FILE * fout)75 inline void PrintReference(sarray_type* sarray, lcp_type* lcp, size_t size,
76                            int idx, int left_ix, int right_ix, int left_lcp,
77                            int right_lcp, FILE* fout) {
78   int max_lcp_ix;
79   if (right_ix == size - 1 || (left_ix >= 0 && left_lcp >= right_lcp)) {
80     max_lcp_ix = left_ix;
81   } else {
82     max_lcp_ix = right_ix;
83   }
84   int dist = idx - sarray[max_lcp_ix];
85   assert(dist > 0);
86   fputc(1, fout);
87   fwrite(&idx, sizeof(int), 1, fout);   // Position in input.
88   fwrite(&dist, sizeof(int), 1, fout);  // Backward distance.
89 }
90 
GoLeft(sarray_type * sarray,lcp_type * lcp,int idx,int left_ix,int left_lcp,entry_type * entry)91 inline void GoLeft(sarray_type* sarray, lcp_type* lcp, int idx, int left_ix,
92                    int left_lcp, entry_type* entry) {
93   entry->first = left_lcp;
94   if (left_lcp > FLAGS_long_length) return;
95   for (; left_ix >= 0; --left_ix) {
96     if (lcp[left_ix] < left_lcp) break;
97     if (sarray[left_ix] < idx) {
98       entry->second.push_back(idx - sarray[left_ix]);
99     }
100   }
101 }
102 
GoRight(sarray_type * sarray,lcp_type * lcp,int idx,size_t size,int right_ix,int right_lcp,entry_type * entry)103 inline void GoRight(sarray_type* sarray, lcp_type* lcp, int idx, size_t size,
104                     int right_ix, int right_lcp, entry_type* entry) {
105   entry->first = right_lcp;
106   if (right_lcp > FLAGS_long_length) return;
107   for (; right_ix < size - 1; ++right_ix) {
108     if (lcp[right_ix] < right_lcp) break;
109     if (sarray[right_ix] < idx) {
110       entry->second.push_back(idx - sarray[right_ix]);
111     }
112   }
113 }
114 
StoreReference(sarray_type * sarray,lcp_type * lcp,size_t size,int idx,int left_ix,int right_ix,int left_lcp,int right_lcp,entry_type * entries)115 inline void StoreReference(sarray_type* sarray, lcp_type* lcp, size_t size,
116                            int idx, int left_ix, int right_ix, int left_lcp,
117                            int right_lcp, entry_type* entries) {
118   if (right_ix == size - 1 || (left_ix >= 0 && left_lcp > right_lcp)) {
119     // right is invalid or left is better
120     GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
121   } else if (left_ix < 0 || (right_ix < size - 1 && right_lcp > left_lcp)) {
122     // left is invalid or right is better
123     GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
124   } else {  // both are valid and of equal length
125     GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
126     GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
127   }
128 }
129 
ProcessReferences(sarray_type * sarray,lcp_type * lcp,size_t size,uint32_t * pos,const Fn & process_output)130 void ProcessReferences(sarray_type* sarray, lcp_type* lcp, size_t size,
131                        uint32_t* pos, const Fn& process_output) {
132   int min_length = FLAGS_min_length;
133   for (int idx = FLAGS_skip; idx < size; ++idx) {
134     int left_lcp = -1;
135     int left_ix;
136     for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) {
137       if (left_lcp == -1 || left_lcp > lcp[left_ix]) {
138         left_lcp = lcp[left_ix];
139       }
140       if (left_lcp == 0) break;
141       if (sarray[left_ix] < idx) break;
142     }
143 
144     int right_lcp = -1;
145     int right_ix;
146     for (right_ix = pos[idx]; right_ix < size - 1; ++right_ix) {
147       if (right_lcp == -1 || right_lcp > lcp[right_ix]) {
148         right_lcp = lcp[right_ix];
149       }
150       // Stop if we have better result from the left side already.
151       if (right_lcp < left_lcp && left_ix >= 0) break;
152       if (right_lcp == 0) break;
153       if (sarray[right_ix] < idx) break;
154     }
155 
156     if ((left_ix >= 0 && left_lcp >= min_length) ||
157         (right_ix < size - 1 && right_lcp >= min_length)) {
158       process_output(sarray, lcp, size, idx, left_ix, right_ix, left_lcp,
159                      right_lcp);
160     }
161   }
162 }
163 
ProcessEntries(entry_type * entries,size_t size,FILE * fout)164 void ProcessEntries(entry_type* entries, size_t size, FILE* fout) {
165   int long_length = FLAGS_long_length;
166   std::vector<std::pair<int, int> > segments;
167   size_t idx;
168   for (idx = 0; idx < size;) {
169     entry_type& entry = entries[idx];
170     if (entry.first > long_length) {
171       // Add segment.
172       if (segments.empty() || segments.back().second < idx) {
173         segments.push_back({idx, idx + entry.first});
174       } else {
175         segments.back().second = idx + entry.first;
176       }
177     }
178     ++idx;
179   }
180   printf("Segments generated.\n");
181   size_t segments_ix = 0;
182   for (idx = 0; idx < size;) {
183     if (idx == segments[segments_ix].first) {
184       // Skip segment.
185       idx = segments[segments_ix].second;
186     } else {
187       for (auto& dist : entries[idx].second) {
188         fputc(1, fout);
189         fwrite(&idx, sizeof(int), 1, fout);   // Position in input.
190         fwrite(&dist, sizeof(int), 1, fout);  // Backward distance.
191       }
192       ++idx;
193     }
194   }
195 }
196 
main(int argc,char * argv[])197 int main(int argc, char* argv[]) {
198   ParseCommandLineFlags(&argc, &argv, true);
199   if (argc != 3) {
200     printf("usage: %s input_file output_file\n", argv[0]);
201     return 1;
202   }
203 
204   FILE* fin = fopen(argv[1], "rb");
205   FILE* fout = fopen(argv[2], "w");
206 
207   fseek(fin, 0, SEEK_END);
208   int input_size = ftell(fin);
209   fseek(fin, 0, SEEK_SET);
210   printf("The file size is %u bytes\n", input_size);
211 
212   input_type* storage = new input_type[input_size];
213 
214   ReadInput(fin, storage, input_size);
215   fclose(fin);
216 
217   sarray_type* sarray = new sarray_type[input_size];
218   saisxx(storage, sarray, input_size);
219   printf("Suffix array calculated.\n");
220 
221   // Inverse suffix array.
222   uint32_t* pos = new uint32_t[input_size];
223 
224   lcp_type* lcp = new lcp_type[input_size];
225   BuildLCP(storage, sarray, lcp, input_size, pos);
226   printf("LCP array constructed.\n");
227   delete[] storage;
228 
229   using std::placeholders::_1;
230   using std::placeholders::_2;
231   using std::placeholders::_3;
232   using std::placeholders::_4;
233   using std::placeholders::_5;
234   using std::placeholders::_6;
235   using std::placeholders::_7;
236   using std::placeholders::_8;
237   entry_type* entries;
238   if (FLAGS_advanced) {
239     entries = new entry_type[input_size];
240     for (size_t i = 0; i < input_size; ++i) entries[i].first = -1;
241   }
242   Fn print = std::bind(PrintReference, _1, _2, _3, _4, _5, _6, _7, _8, fout);
243   Fn store = std::bind(StoreReference, _1, _2, _3, _4, _5, _6, _7, _8, entries);
244 
245   ProcessReferences(sarray, lcp, input_size, pos,
246                     FLAGS_advanced ? store : print);
247   printf("References processed.\n");
248 
249   if (FLAGS_advanced) {
250     int good_cnt = 0;
251     uint64_t avg_cnt = 0;
252     for (size_t i = 0; i < input_size; ++i) {
253       if (entries[i].first != -1) {
254         ++good_cnt;
255         avg_cnt += entries[i].second.size();
256       }
257     }
258     printf("Number of covered positions = %d\n", good_cnt);
259     printf("Average number of references per covered position = %.4lf\n",
260             static_cast<double>(avg_cnt) / good_cnt);
261     ProcessEntries(entries, input_size, fout);
262     printf("Entries processed.\n");
263   }
264 
265   fclose(fout);
266   return 0;
267 }
268