1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package norm 6 7import ( 8 "fmt" 9 "unicode/utf8" 10) 11 12// MaxSegmentSize is the maximum size of a byte buffer needed to consider any 13// sequence of starter and non-starter runes for the purpose of normalization. 14const MaxSegmentSize = maxByteBufferSize 15 16// An Iter iterates over a string or byte slice, while normalizing it 17// to a given Form. 18type Iter struct { 19 rb reorderBuffer 20 buf [maxByteBufferSize]byte 21 info Properties // first character saved from previous iteration 22 next iterFunc // implementation of next depends on form 23 asciiF iterFunc 24 25 p int // current position in input source 26 multiSeg []byte // remainder of multi-segment decomposition 27} 28 29type iterFunc func(*Iter) []byte 30 31// Init initializes i to iterate over src after normalizing it to Form f. 32func (i *Iter) Init(f Form, src []byte) { 33 i.p = 0 34 if len(src) == 0 { 35 i.setDone() 36 i.rb.nsrc = 0 37 return 38 } 39 i.multiSeg = nil 40 i.rb.init(f, src) 41 i.next = i.rb.f.nextMain 42 i.asciiF = nextASCIIBytes 43 i.info = i.rb.f.info(i.rb.src, i.p) 44 i.rb.ss.first(i.info) 45} 46 47// InitString initializes i to iterate over src after normalizing it to Form f. 48func (i *Iter) InitString(f Form, src string) { 49 i.p = 0 50 if len(src) == 0 { 51 i.setDone() 52 i.rb.nsrc = 0 53 return 54 } 55 i.multiSeg = nil 56 i.rb.initString(f, src) 57 i.next = i.rb.f.nextMain 58 i.asciiF = nextASCIIString 59 i.info = i.rb.f.info(i.rb.src, i.p) 60 i.rb.ss.first(i.info) 61} 62 63// Seek sets the segment to be returned by the next call to Next to start 64// at position p. It is the responsibility of the caller to set p to the 65// start of a segment. 66func (i *Iter) Seek(offset int64, whence int) (int64, error) { 67 var abs int64 68 switch whence { 69 case 0: 70 abs = offset 71 case 1: 72 abs = int64(i.p) + offset 73 case 2: 74 abs = int64(i.rb.nsrc) + offset 75 default: 76 return 0, fmt.Errorf("norm: invalid whence") 77 } 78 if abs < 0 { 79 return 0, fmt.Errorf("norm: negative position") 80 } 81 if int(abs) >= i.rb.nsrc { 82 i.setDone() 83 return int64(i.p), nil 84 } 85 i.p = int(abs) 86 i.multiSeg = nil 87 i.next = i.rb.f.nextMain 88 i.info = i.rb.f.info(i.rb.src, i.p) 89 i.rb.ss.first(i.info) 90 return abs, nil 91} 92 93// returnSlice returns a slice of the underlying input type as a byte slice. 94// If the underlying is of type []byte, it will simply return a slice. 95// If the underlying is of type string, it will copy the slice to the buffer 96// and return that. 97func (i *Iter) returnSlice(a, b int) []byte { 98 if i.rb.src.bytes == nil { 99 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] 100 } 101 return i.rb.src.bytes[a:b] 102} 103 104// Pos returns the byte position at which the next call to Next will commence processing. 105func (i *Iter) Pos() int { 106 return i.p 107} 108 109func (i *Iter) setDone() { 110 i.next = nextDone 111 i.p = i.rb.nsrc 112} 113 114// Done returns true if there is no more input to process. 115func (i *Iter) Done() bool { 116 return i.p >= i.rb.nsrc 117} 118 119// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. 120// For any input a and b for which f(a) == f(b), subsequent calls 121// to Next will return the same segments. 122// Modifying runes are grouped together with the preceding starter, if such a starter exists. 123// Although not guaranteed, n will typically be the smallest possible n. 124func (i *Iter) Next() []byte { 125 return i.next(i) 126} 127 128func nextASCIIBytes(i *Iter) []byte { 129 p := i.p + 1 130 if p >= i.rb.nsrc { 131 i.setDone() 132 return i.rb.src.bytes[i.p:p] 133 } 134 if i.rb.src.bytes[p] < utf8.RuneSelf { 135 p0 := i.p 136 i.p = p 137 return i.rb.src.bytes[p0:p] 138 } 139 i.info = i.rb.f.info(i.rb.src, i.p) 140 i.next = i.rb.f.nextMain 141 return i.next(i) 142} 143 144func nextASCIIString(i *Iter) []byte { 145 p := i.p + 1 146 if p >= i.rb.nsrc { 147 i.buf[0] = i.rb.src.str[i.p] 148 i.setDone() 149 return i.buf[:1] 150 } 151 if i.rb.src.str[p] < utf8.RuneSelf { 152 i.buf[0] = i.rb.src.str[i.p] 153 i.p = p 154 return i.buf[:1] 155 } 156 i.info = i.rb.f.info(i.rb.src, i.p) 157 i.next = i.rb.f.nextMain 158 return i.next(i) 159} 160 161func nextHangul(i *Iter) []byte { 162 p := i.p 163 next := p + hangulUTF8Size 164 if next >= i.rb.nsrc { 165 i.setDone() 166 } else if i.rb.src.hangul(next) == 0 { 167 i.rb.ss.next(i.info) 168 i.info = i.rb.f.info(i.rb.src, i.p) 169 i.next = i.rb.f.nextMain 170 return i.next(i) 171 } 172 i.p = next 173 return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] 174} 175 176func nextDone(i *Iter) []byte { 177 return nil 178} 179 180// nextMulti is used for iterating over multi-segment decompositions 181// for decomposing normal forms. 182func nextMulti(i *Iter) []byte { 183 j := 0 184 d := i.multiSeg 185 // skip first rune 186 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { 187 } 188 for j < len(d) { 189 info := i.rb.f.info(input{bytes: d}, j) 190 if info.BoundaryBefore() { 191 i.multiSeg = d[j:] 192 return d[:j] 193 } 194 j += int(info.size) 195 } 196 // treat last segment as normal decomposition 197 i.next = i.rb.f.nextMain 198 return i.next(i) 199} 200 201// nextMultiNorm is used for iterating over multi-segment decompositions 202// for composing normal forms. 203func nextMultiNorm(i *Iter) []byte { 204 j := 0 205 d := i.multiSeg 206 for j < len(d) { 207 info := i.rb.f.info(input{bytes: d}, j) 208 if info.BoundaryBefore() { 209 i.rb.compose() 210 seg := i.buf[:i.rb.flushCopy(i.buf[:])] 211 i.rb.insertUnsafe(input{bytes: d}, j, info) 212 i.multiSeg = d[j+int(info.size):] 213 return seg 214 } 215 i.rb.insertUnsafe(input{bytes: d}, j, info) 216 j += int(info.size) 217 } 218 i.multiSeg = nil 219 i.next = nextComposed 220 return doNormComposed(i) 221} 222 223// nextDecomposed is the implementation of Next for forms NFD and NFKD. 224func nextDecomposed(i *Iter) (next []byte) { 225 outp := 0 226 inCopyStart, outCopyStart := i.p, 0 227 for { 228 if sz := int(i.info.size); sz <= 1 { 229 i.rb.ss = 0 230 p := i.p 231 i.p++ // ASCII or illegal byte. Either way, advance by 1. 232 if i.p >= i.rb.nsrc { 233 i.setDone() 234 return i.returnSlice(p, i.p) 235 } else if i.rb.src._byte(i.p) < utf8.RuneSelf { 236 i.next = i.asciiF 237 return i.returnSlice(p, i.p) 238 } 239 outp++ 240 } else if d := i.info.Decomposition(); d != nil { 241 // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. 242 // Case 1: there is a leftover to copy. In this case the decomposition 243 // must begin with a modifier and should always be appended. 244 // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. 245 p := outp + len(d) 246 if outp > 0 { 247 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) 248 // TODO: this condition should not be possible, but we leave it 249 // in for defensive purposes. 250 if p > len(i.buf) { 251 return i.buf[:outp] 252 } 253 } else if i.info.multiSegment() { 254 // outp must be 0 as multi-segment decompositions always 255 // start a new segment. 256 if i.multiSeg == nil { 257 i.multiSeg = d 258 i.next = nextMulti 259 return nextMulti(i) 260 } 261 // We are in the last segment. Treat as normal decomposition. 262 d = i.multiSeg 263 i.multiSeg = nil 264 p = len(d) 265 } 266 prevCC := i.info.tccc 267 if i.p += sz; i.p >= i.rb.nsrc { 268 i.setDone() 269 i.info = Properties{} // Force BoundaryBefore to succeed. 270 } else { 271 i.info = i.rb.f.info(i.rb.src, i.p) 272 } 273 switch i.rb.ss.next(i.info) { 274 case ssOverflow: 275 i.next = nextCGJDecompose 276 fallthrough 277 case ssStarter: 278 if outp > 0 { 279 copy(i.buf[outp:], d) 280 return i.buf[:p] 281 } 282 return d 283 } 284 copy(i.buf[outp:], d) 285 outp = p 286 inCopyStart, outCopyStart = i.p, outp 287 if i.info.ccc < prevCC { 288 goto doNorm 289 } 290 continue 291 } else if r := i.rb.src.hangul(i.p); r != 0 { 292 outp = decomposeHangul(i.buf[:], r) 293 i.p += hangulUTF8Size 294 inCopyStart, outCopyStart = i.p, outp 295 if i.p >= i.rb.nsrc { 296 i.setDone() 297 break 298 } else if i.rb.src.hangul(i.p) != 0 { 299 i.next = nextHangul 300 return i.buf[:outp] 301 } 302 } else { 303 p := outp + sz 304 if p > len(i.buf) { 305 break 306 } 307 outp = p 308 i.p += sz 309 } 310 if i.p >= i.rb.nsrc { 311 i.setDone() 312 break 313 } 314 prevCC := i.info.tccc 315 i.info = i.rb.f.info(i.rb.src, i.p) 316 if v := i.rb.ss.next(i.info); v == ssStarter { 317 break 318 } else if v == ssOverflow { 319 i.next = nextCGJDecompose 320 break 321 } 322 if i.info.ccc < prevCC { 323 goto doNorm 324 } 325 } 326 if outCopyStart == 0 { 327 return i.returnSlice(inCopyStart, i.p) 328 } else if inCopyStart < i.p { 329 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) 330 } 331 return i.buf[:outp] 332doNorm: 333 // Insert what we have decomposed so far in the reorderBuffer. 334 // As we will only reorder, there will always be enough room. 335 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) 336 i.rb.insertDecomposed(i.buf[0:outp]) 337 return doNormDecomposed(i) 338} 339 340func doNormDecomposed(i *Iter) []byte { 341 for { 342 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 343 if i.p += int(i.info.size); i.p >= i.rb.nsrc { 344 i.setDone() 345 break 346 } 347 i.info = i.rb.f.info(i.rb.src, i.p) 348 if i.info.ccc == 0 { 349 break 350 } 351 if s := i.rb.ss.next(i.info); s == ssOverflow { 352 i.next = nextCGJDecompose 353 break 354 } 355 } 356 // new segment or too many combining characters: exit normalization 357 return i.buf[:i.rb.flushCopy(i.buf[:])] 358} 359 360func nextCGJDecompose(i *Iter) []byte { 361 i.rb.ss = 0 362 i.rb.insertCGJ() 363 i.next = nextDecomposed 364 i.rb.ss.first(i.info) 365 buf := doNormDecomposed(i) 366 return buf 367} 368 369// nextComposed is the implementation of Next for forms NFC and NFKC. 370func nextComposed(i *Iter) []byte { 371 outp, startp := 0, i.p 372 var prevCC uint8 373 for { 374 if !i.info.isYesC() { 375 goto doNorm 376 } 377 prevCC = i.info.tccc 378 sz := int(i.info.size) 379 if sz == 0 { 380 sz = 1 // illegal rune: copy byte-by-byte 381 } 382 p := outp + sz 383 if p > len(i.buf) { 384 break 385 } 386 outp = p 387 i.p += sz 388 if i.p >= i.rb.nsrc { 389 i.setDone() 390 break 391 } else if i.rb.src._byte(i.p) < utf8.RuneSelf { 392 i.rb.ss = 0 393 i.next = i.asciiF 394 break 395 } 396 i.info = i.rb.f.info(i.rb.src, i.p) 397 if v := i.rb.ss.next(i.info); v == ssStarter { 398 break 399 } else if v == ssOverflow { 400 i.next = nextCGJCompose 401 break 402 } 403 if i.info.ccc < prevCC { 404 goto doNorm 405 } 406 } 407 return i.returnSlice(startp, i.p) 408doNorm: 409 // reset to start position 410 i.p = startp 411 i.info = i.rb.f.info(i.rb.src, i.p) 412 i.rb.ss.first(i.info) 413 if i.info.multiSegment() { 414 d := i.info.Decomposition() 415 info := i.rb.f.info(input{bytes: d}, 0) 416 i.rb.insertUnsafe(input{bytes: d}, 0, info) 417 i.multiSeg = d[int(info.size):] 418 i.next = nextMultiNorm 419 return nextMultiNorm(i) 420 } 421 i.rb.ss.first(i.info) 422 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 423 return doNormComposed(i) 424} 425 426func doNormComposed(i *Iter) []byte { 427 // First rune should already be inserted. 428 for { 429 if i.p += int(i.info.size); i.p >= i.rb.nsrc { 430 i.setDone() 431 break 432 } 433 i.info = i.rb.f.info(i.rb.src, i.p) 434 if s := i.rb.ss.next(i.info); s == ssStarter { 435 break 436 } else if s == ssOverflow { 437 i.next = nextCGJCompose 438 break 439 } 440 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 441 } 442 i.rb.compose() 443 seg := i.buf[:i.rb.flushCopy(i.buf[:])] 444 return seg 445} 446 447func nextCGJCompose(i *Iter) []byte { 448 i.rb.ss = 0 // instead of first 449 i.rb.insertCGJ() 450 i.next = nextComposed 451 // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, 452 // even if they are not. This is particularly dubious for U+FF9E and UFF9A. 453 // If we ever change that, insert a check here. 454 i.rb.ss.first(i.info) 455 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 456 return doNormComposed(i) 457} 458