1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /** 4 ******************************************************************************* 5 * Copyright (C) 1996-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl.coll; 10 11 import com.ibm.icu.util.ByteArrayWrapper; 12 13 /** 14 * <p>Binary Ordered Compression Scheme for Unicode</p> 15 * 16 * <p>Users are strongly encouraged to read the ICU paper on 17 * <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html"> 18 * BOCU</a> before attempting to use this class.</p> 19 * 20 * <p>BOCU is used to compress unicode text into a stream of unsigned 21 * bytes. For many kinds of text the compression compares favorably 22 * to UTF-8, and for some kinds of text (such as CJK) it does better. 23 * The resulting bytes will compare in the same order as the original 24 * code points. The byte stream does not contain the values 0, 1, or 25 * 2.</p> 26 * 27 * <p>One example of a use of BOCU is in 28 * com.ibm.icu.text.Collator#getCollationKey(String) for a RuleBasedCollator object with 29 * collation strength IDENTICAL. The result CollationKey will consist of the 30 * collation order of the source string followed by the BOCU result of the 31 * source string. 32 * </p> 33 * 34 * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for 35 * random access.</p> 36 * 37 * <p>Method: Slope Detection<br> Remember the previous code point 38 * (initial 0). For each code point in the string, encode the 39 * difference with the previous one. Similar to a UTF, the length of 40 * the byte sequence is encoded in the lead bytes. Unlike a UTF, the 41 * trail byte values may overlap with lead/single byte values. The 42 * signedness of the difference must be encoded as the most 43 * significant part.</p> 44 * 45 * <p>We encode differences with few bytes if their absolute values 46 * are small. For correct ordering, we must treat the entire value 47 * range -10ffff..+10ffff in ascending order, which forbids encoding 48 * the sign and the absolute value separately. Instead, we split the 49 * lead byte range in the middle and encode non-negative values going 50 * up and negative values going down.</p> 51 * 52 * <p>For very small absolute values, the difference is added to a 53 * middle byte value for single-byte encoded differences. For 54 * somewhat larger absolute values, the difference is divided by the 55 * number of byte values available, the modulo is used for one trail 56 * byte, and the remainder is added to a lead byte avoiding the 57 * single-byte range. For large absolute values, the difference is 58 * similarly encoded in three bytes. (Syn Wee, I need examples 59 * here.)</p> 60 * 61 * <p>BOCU does not use byte values 0, 1, or 2, but uses all other 62 * byte values for lead and single bytes, so that the middle range of 63 * single bytes is as large as possible.</p> 64 * 65 * <p>Note that the lead byte ranges overlap some, but that the 66 * sequences as a whole are well ordered. I.e., even if the lead byte 67 * is the same for sequences of different lengths, the trail bytes 68 * establish correct order. It would be possible to encode slightly 69 * larger ranges for each length (>1) by subtracting the lower bound 70 * of the range. However, that would also slow down the calculation. 71 * (Syn Wee, need an example).</p> 72 * 73 * <p>For the actual string encoding, an optimization moves the 74 * previous code point value to the middle of its Unicode script block 75 * to minimize the differences in same-script text runs. (Syn Wee, 76 * need an example.)</p> 77 * 78 * @author Syn Wee Quek 79 * @since release 2.2, May 3rd 2002 80 */ 81 public class BOCSU 82 { 83 // public methods ------------------------------------------------------- 84 85 /** 86 * Encode the code points of a string as 87 * a sequence of byte-encoded differences (slope detection), 88 * preserving lexical order. 89 * 90 * <p>Optimize the difference-taking for runs of Unicode text within 91 * small scripts: 92 * 93 * <p>Most small scripts are allocated within aligned 128-blocks of Unicode 94 * code points. Lexical order is preserved if "prev" is always moved 95 * into the middle of such a block. 96 * 97 * <p>Additionally, "prev" is moved from anywhere in the Unihan 98 * area into the middle of that area. 99 * Note that the identical-level run in a sort key is generated from 100 * NFD text - there are never Hangul characters included. 101 */ writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink)102 public static int writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink) { 103 while (i < length) { 104 // We must have capacity>=SLOPE_MAX_BYTES in case writeDiff() writes that much, 105 // but we do not want to force the sink to allocate 106 // for a large min_capacity because we might actually only write one byte. 107 ensureAppendCapacity(sink, 16, s.length() * 2); 108 byte[] buffer = sink.bytes; 109 int capacity = buffer.length; 110 int p = sink.size; 111 int lastSafe = capacity - SLOPE_MAX_BYTES_; 112 while (i < length && p <= lastSafe) { 113 if (prev < 0x4e00 || prev >= 0xa000) { 114 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_; 115 } else { 116 // Unihan U+4e00..U+9fa5: 117 // double-bytes down from the upper end 118 prev = 0x9fff - SLOPE_REACH_POS_2_; 119 } 120 121 int c = Character.codePointAt(s, i); 122 i += Character.charCount(c); 123 if (c == 0xfffe) { 124 buffer[p++] = 2; // merge separator 125 prev = 0; 126 } else { 127 p = writeDiff(c - prev, buffer, p); 128 prev = c; 129 } 130 } 131 sink.size = p; 132 } 133 return prev; 134 } 135 ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity)136 private static void ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity) { 137 int remainingCapacity = sink.bytes.length - sink.size; 138 if (remainingCapacity >= minCapacity) { return; } 139 if (desiredCapacity < minCapacity) { desiredCapacity = minCapacity; } 140 sink.ensureCapacity(sink.size + desiredCapacity); 141 } 142 143 // private data members -------------------------------------------------- 144 145 /** 146 * Do not use byte values 0, 1, 2 because they are separators in sort keys. 147 */ 148 private static final int SLOPE_MIN_ = 3; 149 private static final int SLOPE_MAX_ = 0xff; 150 private static final int SLOPE_MIDDLE_ = 0x81; 151 private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1; 152 private static final int SLOPE_MAX_BYTES_ = 4; 153 154 /** 155 * Number of lead bytes: 156 * 1 middle byte for 0 157 * 2*80=160 single bytes for !=0 158 * 2*42=84 for double-byte values 159 * 2*3=6 for 3-byte values 160 * 2*1=2 for 4-byte values 161 * 162 * The sum must be <=SLOPE_TAIL_COUNT. 163 * 164 * Why these numbers? 165 * - There should be >=128 single-byte values to cover 128-blocks 166 * with small scripts. 167 * - There should be >=20902 single/double-byte values to cover Unihan. 168 * - It helps CJK Extension B some if there are 3-byte values that cover 169 * the distance between them and Unihan. 170 * This also helps to jump among distant places in the BMP. 171 * - Four-byte values are necessary to cover the rest of Unicode. 172 * 173 * Symmetrical lead byte counts are for convenience. 174 * With an equal distribution of even and odd differences there is also 175 * no advantage to asymmetrical lead byte counts. 176 */ 177 private static final int SLOPE_SINGLE_ = 80; 178 private static final int SLOPE_LEAD_2_ = 42; 179 private static final int SLOPE_LEAD_3_ = 3; 180 //private static final int SLOPE_LEAD_4_ = 1; 181 182 /** 183 * The difference value range for single-byters. 184 */ 185 private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_; 186 private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_); 187 188 /** 189 * The difference value range for double-byters. 190 */ 191 private static final int SLOPE_REACH_POS_2_ = 192 SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1; 193 private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1); 194 195 /** 196 * The difference value range for 3-byters. 197 */ 198 private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_ 199 * SLOPE_TAIL_COUNT_ 200 * SLOPE_TAIL_COUNT_ 201 + (SLOPE_LEAD_3_ - 1) 202 * SLOPE_TAIL_COUNT_ + 203 (SLOPE_TAIL_COUNT_ - 1); 204 private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1); 205 206 /** 207 * The lead byte start values. 208 */ 209 private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_ 210 + SLOPE_SINGLE_ + 1; 211 private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_ 212 + SLOPE_LEAD_2_; 213 private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ + 214 SLOPE_REACH_NEG_1_; 215 private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_ 216 - SLOPE_LEAD_2_; 217 218 // private constructor --------------------------------------------------- 219 220 /** 221 * Constructor private to prevent initialization 222 */ 223 ///CLOVER:OFF BOCSU()224 private BOCSU() 225 { 226 } 227 ///CLOVER:ON 228 229 // private methods ------------------------------------------------------- 230 231 /** 232 * Integer division and modulo with negative numerators 233 * yields negative modulo results and quotients that are one more than 234 * what we need here. 235 * @param number which operations are to be performed on 236 * @param factor the factor to use for division 237 * @return (result of division) << 32 | modulo 238 */ getNegDivMod(int number, int factor)239 private static final long getNegDivMod(int number, int factor) 240 { 241 int modulo = number % factor; 242 long result = number / factor; 243 if (modulo < 0) { 244 -- result; 245 modulo += factor; 246 } 247 return (result << 32) | modulo; 248 } 249 250 /** 251 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes, 252 * preserving lexical order 253 * @param diff 254 * @param buffer byte buffer to append to 255 * @param offset to the byte buffer to start appending 256 * @return end offset where the appending stops 257 */ writeDiff(int diff, byte buffer[], int offset)258 private static final int writeDiff(int diff, byte buffer[], int offset) 259 { 260 if (diff >= SLOPE_REACH_NEG_1_) { 261 if (diff <= SLOPE_REACH_POS_1_) { 262 buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff); 263 } 264 else if (diff <= SLOPE_REACH_POS_2_) { 265 buffer[offset ++] = (byte)(SLOPE_START_POS_2_ 266 + (diff / SLOPE_TAIL_COUNT_)); 267 buffer[offset ++] = (byte)(SLOPE_MIN_ + 268 (diff % SLOPE_TAIL_COUNT_)); 269 } 270 else if (diff <= SLOPE_REACH_POS_3_) { 271 buffer[offset + 2] = (byte)(SLOPE_MIN_ 272 + (diff % SLOPE_TAIL_COUNT_)); 273 diff /= SLOPE_TAIL_COUNT_; 274 buffer[offset + 1] = (byte)(SLOPE_MIN_ 275 + (diff % SLOPE_TAIL_COUNT_)); 276 buffer[offset] = (byte)(SLOPE_START_POS_3_ 277 + (diff / SLOPE_TAIL_COUNT_)); 278 offset += 3; 279 } 280 else { 281 buffer[offset + 3] = (byte)(SLOPE_MIN_ 282 + diff % SLOPE_TAIL_COUNT_); 283 diff /= SLOPE_TAIL_COUNT_; 284 buffer[offset + 2] = (byte)(SLOPE_MIN_ 285 + diff % SLOPE_TAIL_COUNT_); 286 diff /= SLOPE_TAIL_COUNT_; 287 buffer[offset + 1] = (byte)(SLOPE_MIN_ 288 + diff % SLOPE_TAIL_COUNT_); 289 buffer[offset] = (byte)SLOPE_MAX_; 290 offset += 4; 291 } 292 } 293 else { 294 long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 295 int modulo = (int)division; 296 if (diff >= SLOPE_REACH_NEG_2_) { 297 diff = (int)(division >> 32); 298 buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff); 299 buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo); 300 } 301 else if (diff >= SLOPE_REACH_NEG_3_) { 302 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo); 303 diff = (int)(division >> 32); 304 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 305 modulo = (int)division; 306 diff = (int)(division >> 32); 307 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo); 308 buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff); 309 offset += 3; 310 } 311 else { 312 buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo); 313 diff = (int)(division >> 32); 314 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 315 modulo = (int)division; 316 diff = (int)(division >> 32); 317 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo); 318 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_); 319 modulo = (int)division; 320 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo); 321 buffer[offset] = SLOPE_MIN_; 322 offset += 4; 323 } 324 } 325 return offset; 326 } 327 } 328