1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/lib/strings/ordered_code.h"
17 
18 #include <assert.h>
19 #include <stddef.h>
20 
21 #include "tensorflow/core/lib/core/stringpiece.h"
22 #include "tensorflow/core/platform/logging.h"
23 
24 namespace tensorflow {
25 namespace strings {
26 
27 // We encode a string in different ways depending on whether the item
28 // should be in lexicographically increasing or decreasing order.
29 //
30 //
31 // Lexicographically increasing order
32 //
33 // We want a string-to-string mapping F(x) such that for any two strings
34 //
35 //      x < y   =>   F(x) < F(y)
36 //
37 // In addition to the normal characters '\x00' through '\xff', we want to
38 // encode a few extra symbols in strings:
39 //
40 //      <sep>           Separator between items
41 //      <infinity>      Infinite string
42 //
43 // Therefore we need an alphabet with at least 258 symbols.  Each
44 // character '\1' through '\xfe' is mapped to itself.  The other four are
45 // encoded into two-letter sequences starting with '\0' and '\xff':
46 //
47 //      <sep>           encoded as =>           \0\1
48 //      \0              encoded as =>           \0\xff
49 //      \xff            encoded as =>           \xff\x00
50 //      <infinity>      encoded as =>           \xff\xff
51 //
52 // The remaining two-letter sequences starting with '\0' and '\xff' are
53 // currently unused.
54 //
55 // F(<infinity>) is defined above.  For any finite string x, F(x) is the
56 // the encodings of x's characters followed by the encoding for <sep>.  The
57 // ordering of two finite strings is the same as the ordering of the
58 // respective characters at the first position where they differ, which in
59 // turn is the same as the ordering of the encodings of those two
60 // characters.  Moreover, for every finite string x, F(x) < F(<infinity>).
61 //
62 //
63 // Lexicographically decreasing order
64 //
65 // We want a string-to-string mapping G(x) such that for any two strings,
66 // whether finite or not,
67 //
68 //      x < y   =>   G(x) > G(y)
69 //
70 // To achieve this, define G(x) to be the inversion of F(x): I(F(x)).  In
71 // other words, invert every bit in F(x) to get G(x). For example,
72 //
73 //        x  = \x00\x13\xff
74 //      F(x) = \x00\xff\x13\xff\x00\x00\x01  escape \0, \xff, append F(<sep>)
75 //      G(x) = \xff\x00\xec\x00\xff\xff\xfe  invert every bit in F(x)
76 //
77 //        x  = <infinity>
78 //      F(x) = \xff\xff
79 //      G(x) = \x00\x00
80 //
81 // Another example is
82 //
83 //        x            F(x)        G(x) = I(F(x))
84 //        -            ----        --------------
85 //        <infinity>   \xff\xff    \x00\x00
86 //        "foo"        foo\0\1     \x99\x90\x90\xff\xfe
87 //        "aaa"        aaa\0\1     \x9e\x9e\x9e\xff\xfe
88 //        "aa"         aa\0\1      \x9e\x9e\xff\xfe
89 //        ""           \0\1        \xff\xfe
90 //
91 // More generally and rigorously, if for any two strings x and y
92 //
93 //      F(x) < F(y)   =>   I(F(x)) > I(F(y))                      (1)
94 //
95 // it would follow that x < y => G(x) > G(y) because
96 //
97 //      x < y   =>   F(x) < F(y)   =>   G(x) = I(F(x)) > I(F(y)) = G(y)
98 //
99 // We now show why (1) is true, in two parts.  Notice that for any two
100 // strings x < y, F(x) is *not* a proper prefix of F(y).  Suppose x is a
101 // proper prefix of y (say, x="abc" < y="abcd").  F(x) and F(y) diverge at
102 // the F(<sep>) in F(x) (v. F('d') in the example).  Suppose x is not a
103 // proper prefix of y (say, x="abce" < y="abd"), F(x) and F(y) diverge at
104 // their respective encodings of the characters where x and y diverge
105 // (F('c') v. F('d')).  Finally, if y=<infinity>, we can see that
106 // F(y)=\xff\xff is not the prefix of F(x) for any finite string x, simply
107 // by considering all the possible first characters of F(x).
108 //
109 // Given that F(x) is not a proper prefix F(y), the order of F(x) and F(y)
110 // is determined by the byte where F(x) and F(y) diverge.  For example, the
111 // order of F(x)="eefh" and F(y)="eeg" is determined by their third
112 // characters.  I(p) inverts each byte in p, which effectively subtracts
113 // each byte from 0xff.  So, in this example, I('f') > I('g'), and thus
114 // I(F(x)) > I(F(y)).
115 //
116 //
117 // Implementation
118 //
119 // To implement G(x) efficiently, we use C++ template to instantiate two
120 // versions of the code to produce F(x), one for normal encoding (giving us
121 // F(x)) and one for inverted encoding (giving us G(x) = I(F(x))).
122 
123 static const char kEscape1 = '\000';
124 static const char kNullCharacter = '\xff';  // Combined with kEscape1
125 static const char kSeparator = '\001';      // Combined with kEscape1
126 
127 static const char kEscape2 = '\xff';
128 static const char kFFCharacter = '\000';  // Combined with kEscape2
129 
130 static const char kEscape1_Separator[2] = {kEscape1, kSeparator};
131 
132 // Append to "*dest" the "len" bytes starting from "*src".
AppendBytes(string * dest,const char * src,size_t len)133 inline static void AppendBytes(string* dest, const char* src, size_t len) {
134   dest->append(src, len);
135 }
136 
IsSpecialByte(char c)137 inline bool IsSpecialByte(char c) {
138   return (static_cast<unsigned char>(c + 1)) < 2;
139 }
140 
141 // Return a pointer to the first byte in the range "[start..limit)"
142 // whose value is 0 or 255 (kEscape1 or kEscape2).  If no such byte
143 // exists in the range, returns "limit".
SkipToNextSpecialByte(const char * start,const char * limit)144 inline const char* SkipToNextSpecialByte(const char* start, const char* limit) {
145   // If these constants were ever changed, this routine needs to change
146   DCHECK_EQ(kEscape1, 0);
147   DCHECK_EQ(kEscape2 & 0xffu, 255u);
148   const char* p = start;
149   while (p < limit && !IsSpecialByte(*p)) {
150     p++;
151   }
152   return p;
153 }
154 
155 // Expose SkipToNextSpecialByte for testing purposes
TEST_SkipToNextSpecialByte(const char * start,const char * limit)156 const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start,
157                                                     const char* limit) {
158   return SkipToNextSpecialByte(start, limit);
159 }
160 
161 // Helper routine to encode "s" and append to "*dest", escaping special
162 // characters.
EncodeStringFragment(string * dest,StringPiece s)163 inline static void EncodeStringFragment(string* dest, StringPiece s) {
164   const char* p = s.data();
165   const char* limit = p + s.size();
166   const char* copy_start = p;
167   while (true) {
168     p = SkipToNextSpecialByte(p, limit);
169     if (p >= limit) break;  // No more special characters that need escaping
170     char c = *(p++);
171     DCHECK(IsSpecialByte(c));
172     if (c == kEscape1) {
173       AppendBytes(dest, copy_start, p - copy_start - 1);
174       dest->push_back(kEscape1);
175       dest->push_back(kNullCharacter);
176       copy_start = p;
177     } else {
178       assert(c == kEscape2);
179       AppendBytes(dest, copy_start, p - copy_start - 1);
180       dest->push_back(kEscape2);
181       dest->push_back(kFFCharacter);
182       copy_start = p;
183     }
184   }
185   if (p > copy_start) {
186     AppendBytes(dest, copy_start, p - copy_start);
187   }
188 }
189 
WriteString(string * dest,StringPiece s)190 void OrderedCode::WriteString(string* dest, StringPiece s) {
191   EncodeStringFragment(dest, s);
192   AppendBytes(dest, kEscape1_Separator, 2);
193 }
194 
WriteNumIncreasing(string * dest,uint64 val)195 void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) {
196   // Values are encoded with a single byte length prefix, followed
197   // by the actual value in big-endian format with leading 0 bytes
198   // dropped.
199   unsigned char buf[9];  // 8 bytes for value plus one byte for length
200   int len = 0;
201   while (val > 0) {
202     len++;
203     buf[9 - len] = (val & 0xff);
204     val >>= 8;
205   }
206   buf[9 - len - 1] = len;
207   len++;
208   AppendBytes(dest, reinterpret_cast<const char*>(buf + 9 - len), len);
209 }
210 
211 // Parse the encoding of a previously encoded string.
212 // If parse succeeds, return true, consume encoding from
213 // "*src", and if result != NULL append the decoded string to "*result".
214 // Otherwise, return false and leave both undefined.
ReadStringInternal(StringPiece * src,string * result)215 inline static bool ReadStringInternal(StringPiece* src, string* result) {
216   const char* start = src->data();
217   const char* string_limit = src->data() + src->size();
218 
219   // We only scan up to "limit-2" since a valid string must end with
220   // a two character terminator: 'kEscape1 kSeparator'
221   const char* limit = string_limit - 1;
222   const char* copy_start = start;
223   while (true) {
224     start = SkipToNextSpecialByte(start, limit);
225     if (start >= limit) break;  // No terminator sequence found
226     const char c = *(start++);
227     // If inversion is required, instead of inverting 'c', we invert the
228     // character constants to which 'c' is compared.  We get the same
229     // behavior but save the runtime cost of inverting 'c'.
230     DCHECK(IsSpecialByte(c));
231     if (c == kEscape1) {
232       if (result) {
233         AppendBytes(result, copy_start, start - copy_start - 1);
234       }
235       // kEscape1 kSeparator ends component
236       // kEscape1 kNullCharacter represents '\0'
237       const char next = *(start++);
238       if (next == kSeparator) {
239         src->remove_prefix(start - src->data());
240         return true;
241       } else if (next == kNullCharacter) {
242         if (result) {
243           *result += '\0';
244         }
245       } else {
246         return false;
247       }
248       copy_start = start;
249     } else {
250       assert(c == kEscape2);
251       if (result) {
252         AppendBytes(result, copy_start, start - copy_start - 1);
253       }
254       // kEscape2 kFFCharacter represents '\xff'
255       // kEscape2 kInfinity is an error
256       const char next = *(start++);
257       if (next == kFFCharacter) {
258         if (result) {
259           *result += '\xff';
260         }
261       } else {
262         return false;
263       }
264       copy_start = start;
265     }
266   }
267   return false;
268 }
269 
ReadString(StringPiece * src,string * result)270 bool OrderedCode::ReadString(StringPiece* src, string* result) {
271   return ReadStringInternal(src, result);
272 }
273 
ReadNumIncreasing(StringPiece * src,uint64 * result)274 bool OrderedCode::ReadNumIncreasing(StringPiece* src, uint64* result) {
275   if (src->empty()) {
276     return false;  // Not enough bytes
277   }
278 
279   // Decode length byte
280   const size_t len = static_cast<unsigned char>((*src)[0]);
281 
282   // If len > 0 and src is longer than 1, the first byte of "payload"
283   // must be non-zero (otherwise the encoding is not minimal).
284   // In opt mode, we don't enforce that encodings must be minimal.
285   DCHECK(0 == len || src->size() == 1 || (*src)[1] != '\0')
286       << "invalid encoding";
287 
288   if (len + 1 > src->size() || len > 8) {
289     return false;  // Not enough bytes or too many bytes
290   }
291 
292   if (result) {
293     uint64 tmp = 0;
294     for (size_t i = 0; i < len; i++) {
295       tmp <<= 8;
296       tmp |= static_cast<unsigned char>((*src)[1 + i]);
297     }
298     *result = tmp;
299   }
300   src->remove_prefix(len + 1);
301   return true;
302 }
303 
TEST_Corrupt(string * str,int k)304 void OrderedCode::TEST_Corrupt(string* str, int k) {
305   int seen_seps = 0;
306   for (size_t i = 0; i + 1 < str->size(); i++) {
307     if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) {
308       seen_seps++;
309       if (seen_seps == k) {
310         (*str)[i + 1] = kSeparator + 1;
311         return;
312       }
313     }
314   }
315 }
316 
317 // Signed number encoding/decoding /////////////////////////////////////
318 //
319 // The format is as follows:
320 //
321 // The first bit (the most significant bit of the first byte)
322 // represents the sign, 0 if the number is negative and
323 // 1 if the number is >= 0.
324 //
325 // Any unbroken sequence of successive bits with the same value as the sign
326 // bit, up to 9 (the 8th and 9th are the most significant bits of the next
327 // byte), are size bits that count the number of bytes after the first byte.
328 // That is, the total length is between 1 and 10 bytes.
329 //
330 // The value occupies the bits after the sign bit and the "size bits"
331 // till the end of the string, in network byte order.  If the number
332 // is negative, the bits are in 2-complement.
333 //
334 //
335 // Example 1: number 0x424242 -> 4 byte big-endian hex string 0xf0424242:
336 //
337 // +---------------+---------------+---------------+---------------+
338 //  1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0
339 // +---------------+---------------+---------------+---------------+
340 //  ^ ^ ^ ^   ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
341 //  | | | |   | | | | | | | | | | | | | | | | | | | | | | | | | | |
342 //  | | | |   payload: the remaining bits after the sign and size bits
343 //  | | | |            and the delimiter bit, the value is 0x424242
344 //  | | | |
345 //  | size bits: 3 successive bits with the same value as the sign bit
346 //  |            (followed by a delimiter bit with the opposite value)
347 //  |            mean that there are 3 bytes after the first byte, 4 total
348 //  |
349 //  sign bit: 1 means that the number is non-negative
350 //
351 // Example 2: negative number -0x800 -> 2 byte big-endian hex string 0x3800:
352 //
353 // +---------------+---------------+
354 //  0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
355 // +---------------+---------------+
356 //  ^ ^   ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
357 //  | |   | | | | | | | | | | | | | | | | | | | | | | | | | | |
358 //  | |   payload: the remaining bits after the sign and size bits and the
359 //  | |            delimiter bit, 2-complement because of the negative sign,
360 //  | |            value is ~0x7ff, represents the value -0x800
361 //  | |
362 //  | size bits: 1 bit with the same value as the sign bit
363 //  |            (followed by a delimiter bit with the opposite value)
364 //  |            means that there is 1 byte after the first byte, 2 total
365 //  |
366 //  sign bit: 0 means that the number is negative
367 //
368 //
369 // Compared with the simpler unsigned format used for uint64 numbers,
370 // this format is more compact for small numbers, namely one byte encodes
371 // numbers in the range [-64,64), two bytes cover the range [-2^13,2^13), etc.
372 // In general, n bytes encode numbers in the range [-2^(n*7-1),2^(n*7-1)).
373 // (The cross-over point for compactness of representation is 8 bytes,
374 // where this format only covers the range [-2^55,2^55),
375 // whereas an encoding with sign bit and length in the first byte and
376 // payload in all following bytes would cover [-2^56,2^56).)
377 
378 static const int kMaxSigned64Length = 10;
379 
380 // This array maps encoding length to header bits in the first two bytes.
381 static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = {
382     {0, 0},      {'\x80', 0},      {'\xc0', 0},     {'\xe0', 0},
383     {'\xf0', 0}, {'\xf8', 0},      {'\xfc', 0},     {'\xfe', 0},
384     {'\xff', 0}, {'\xff', '\x80'}, {'\xff', '\xc0'}};
385 
386 // This array maps encoding lengths to the header bits that overlap with
387 // the payload and need fixing when reading.
388 static const uint64 kLengthToMask[1 + kMaxSigned64Length] = {
389     0ULL,
390     0x80ULL,
391     0xc000ULL,
392     0xe00000ULL,
393     0xf0000000ULL,
394     0xf800000000ULL,
395     0xfc0000000000ULL,
396     0xfe000000000000ULL,
397     0xff00000000000000ULL,
398     0x8000000000000000ULL,
399     0ULL};
400 
401 // This array maps the number of bits in a number to the encoding
402 // length produced by WriteSignedNumIncreasing.
403 // For positive numbers, the number of bits is 1 plus the most significant
404 // bit position (the highest bit position in a positive int64 is 63).
405 // For a negative number n, we count the bits in ~n.
406 // That is, length = kBitsToLength[Bits::Log2Floor64(n < 0 ? ~n : n) + 1].
407 static const int8 kBitsToLength[1 + 63] = {
408     1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4,
409     4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7,
410     7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10};
411 
412 #if defined(__GNUC__)
413 // Returns floor(lg(n)).  Returns -1 if n == 0.
Log2Floor64(uint64 n)414 static int Log2Floor64(uint64 n) {
415   return n == 0 ? -1 : 63 ^ __builtin_clzll(n);
416 }
417 #else
418 // Portable slow version
Log2Floor32_Portable(uint32 n)419 static int Log2Floor32_Portable(uint32 n) {
420   if (n == 0) return -1;
421   int log = 0;
422   uint32 value = n;
423   for (int i = 4; i >= 0; --i) {
424     int shift = (1 << i);
425     uint32 x = value >> shift;
426     if (x != 0) {
427       value = x;
428       log += shift;
429     }
430   }
431   assert(value == 1);
432   return log;
433 }
434 // Returns floor(lg(n)).  Returns -1 if n == 0.
Log2Floor64(uint64 n)435 static int Log2Floor64(uint64 n) {
436   const uint32 topbits = static_cast<uint32>(n >> 32);
437   if (topbits == 0) {
438     // Top bits are zero, so scan in bottom bits
439     return Log2Floor32_Portable(static_cast<uint32>(n));
440   } else {
441     return 32 + Log2Floor32_Portable(topbits);
442   }
443 }
444 #endif
445 
446 // Calculates the encoding length in bytes of the signed number n.
SignedEncodingLength(int64 n)447 static inline int SignedEncodingLength(int64 n) {
448   return kBitsToLength[Log2Floor64(n < 0 ? ~n : n) + 1];
449 }
450 
StoreBigEndian64(char * dst,uint64 v)451 static void StoreBigEndian64(char* dst, uint64 v) {
452   for (int i = 0; i < 8; i++) {
453     dst[i] = (v >> (56 - 8 * i)) & 0xff;
454   }
455 }
456 
LoadBigEndian64(const char * src)457 static uint64 LoadBigEndian64(const char* src) {
458   uint64 result = 0;
459   for (int i = 0; i < 8; i++) {
460     unsigned char c = static_cast<unsigned char>(src[i]);
461     result |= static_cast<uint64>(c) << (56 - 8 * i);
462   }
463   return result;
464 }
465 
WriteSignedNumIncreasing(string * dest,int64 val)466 void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) {
467   const uint64 x = val < 0 ? ~val : val;
468   if (x < 64) {  // fast path for encoding length == 1
469     *dest += kLengthToHeaderBits[1][0] ^ val;
470     return;
471   }
472   // buf = val in network byte order, sign extended to 10 bytes
473   const char sign_byte = val < 0 ? '\xff' : '\0';
474   char buf[10] = {
475       sign_byte,
476       sign_byte,
477   };
478   StoreBigEndian64(buf + 2, val);
479   static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch");
480   const int len = SignedEncodingLength(x);
481   DCHECK_GE(len, 2);
482   char* const begin = buf + sizeof(buf) - len;
483   begin[0] ^= kLengthToHeaderBits[len][0];
484   begin[1] ^= kLengthToHeaderBits[len][1];  // ok because len >= 2
485   dest->append(begin, len);
486 }
487 
ReadSignedNumIncreasing(StringPiece * src,int64 * result)488 bool OrderedCode::ReadSignedNumIncreasing(StringPiece* src, int64* result) {
489   if (src->empty()) return false;
490   const uint64 xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL;
491   const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff);
492 
493   // now calculate and test length, and set x to raw (unmasked) result
494   int len;
495   uint64 x;
496   if (first_byte != 0xff) {
497     len = 7 - Log2Floor64(first_byte ^ 0xff);
498     if (src->size() < static_cast<size_t>(len)) return false;
499     x = xor_mask;  // sign extend using xor_mask
500     for (int i = 0; i < len; ++i)
501       x = (x << 8) | static_cast<unsigned char>((*src)[i]);
502   } else {
503     len = 8;
504     if (src->size() < static_cast<size_t>(len)) return false;
505     const unsigned char second_byte = (*src)[1] ^ (xor_mask & 0xff);
506     if (second_byte >= 0x80) {
507       if (second_byte < 0xc0) {
508         len = 9;
509       } else {
510         const unsigned char third_byte = (*src)[2] ^ (xor_mask & 0xff);
511         if (second_byte == 0xc0 && third_byte < 0x80) {
512           len = 10;
513         } else {
514           return false;  // either len > 10 or len == 10 and #bits > 63
515         }
516       }
517       if (src->size() < static_cast<size_t>(len)) return false;
518     }
519     x = LoadBigEndian64(src->data() + len - 8);
520   }
521 
522   x ^= kLengthToMask[len];  // remove spurious header bits
523 
524   DCHECK_EQ(len, SignedEncodingLength(x)) << "invalid encoding";
525 
526   if (result) *result = x;
527   src->remove_prefix(len);
528   return true;
529 }
530 
531 }  // namespace strings
532 }  // namespace tensorflow
533