1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package transform provides reader and writer wrappers that transform the
6// bytes passing through as well as various transformations. Example
7// transformations provided by other packages include normalization and
8// conversion between character sets.
9package transform
10
11import (
12	"bytes"
13	"errors"
14	"io"
15	"unicode/utf8"
16)
17
18var (
19	// ErrShortDst means that the destination buffer was too short to
20	// receive all of the transformed bytes.
21	ErrShortDst = errors.New("transform: short destination buffer")
22
23	// ErrShortSrc means that the source buffer has insufficient data to
24	// complete the transformation.
25	ErrShortSrc = errors.New("transform: short source buffer")
26
27	// ErrEndOfSpan means that the input and output (the transformed input)
28	// are not identical.
29	ErrEndOfSpan = errors.New("transform: input and output are not identical")
30
31	// errInconsistentByteCount means that Transform returned success (nil
32	// error) but also returned nSrc inconsistent with the src argument.
33	errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
34
35	// errShortInternal means that an internal buffer is not large enough
36	// to make progress and the Transform operation must be aborted.
37	errShortInternal = errors.New("transform: short internal buffer")
38)
39
40// Transformer transforms bytes.
41type Transformer interface {
42	// Transform writes to dst the transformed bytes read from src, and
43	// returns the number of dst bytes written and src bytes read. The
44	// atEOF argument tells whether src represents the last bytes of the
45	// input.
46	//
47	// Callers should always process the nDst bytes produced and account
48	// for the nSrc bytes consumed before considering the error err.
49	//
50	// A nil error means that all of the transformed bytes (whether freshly
51	// transformed from src or left over from previous Transform calls)
52	// were written to dst. A nil error can be returned regardless of
53	// whether atEOF is true. If err is nil then nSrc must equal len(src);
54	// the converse is not necessarily true.
55	//
56	// ErrShortDst means that dst was too short to receive all of the
57	// transformed bytes. ErrShortSrc means that src had insufficient data
58	// to complete the transformation. If both conditions apply, then
59	// either error may be returned. Other than the error conditions listed
60	// here, implementations are free to report other errors that arise.
61	Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
62
63	// Reset resets the state and allows a Transformer to be reused.
64	Reset()
65}
66
67// SpanningTransformer extends the Transformer interface with a Span method
68// that determines how much of the input already conforms to the Transformer.
69type SpanningTransformer interface {
70	Transformer
71
72	// Span returns a position in src such that transforming src[:n] results in
73	// identical output src[:n] for these bytes. It does not necessarily return
74	// the largest such n. The atEOF argument tells whether src represents the
75	// last bytes of the input.
76	//
77	// Callers should always account for the n bytes consumed before
78	// considering the error err.
79	//
80	// A nil error means that all input bytes are known to be identical to the
81	// output produced by the Transformer. A nil error can be be returned
82	// regardless of whether atEOF is true. If err is nil, then then n must
83	// equal len(src); the converse is not necessarily true.
84	//
85	// ErrEndOfSpan means that the Transformer output may differ from the
86	// input after n bytes. Note that n may be len(src), meaning that the output
87	// would contain additional bytes after otherwise identical output.
88	// ErrShortSrc means that src had insufficient data to determine whether the
89	// remaining bytes would change. Other than the error conditions listed
90	// here, implementations are free to report other errors that arise.
91	//
92	// Calling Span can modify the Transformer state as a side effect. In
93	// effect, it does the transformation just as calling Transform would, only
94	// without copying to a destination buffer and only up to a point it can
95	// determine the input and output bytes are the same. This is obviously more
96	// limited than calling Transform, but can be more efficient in terms of
97	// copying and allocating buffers. Calls to Span and Transform may be
98	// interleaved.
99	Span(src []byte, atEOF bool) (n int, err error)
100}
101
102// NopResetter can be embedded by implementations of Transformer to add a nop
103// Reset method.
104type NopResetter struct{}
105
106// Reset implements the Reset method of the Transformer interface.
107func (NopResetter) Reset() {}
108
109// Reader wraps another io.Reader by transforming the bytes read.
110type Reader struct {
111	r   io.Reader
112	t   Transformer
113	err error
114
115	// dst[dst0:dst1] contains bytes that have been transformed by t but
116	// not yet copied out via Read.
117	dst        []byte
118	dst0, dst1 int
119
120	// src[src0:src1] contains bytes that have been read from r but not
121	// yet transformed through t.
122	src        []byte
123	src0, src1 int
124
125	// transformComplete is whether the transformation is complete,
126	// regardless of whether or not it was successful.
127	transformComplete bool
128}
129
130const defaultBufSize = 4096
131
132// NewReader returns a new Reader that wraps r by transforming the bytes read
133// via t. It calls Reset on t.
134func NewReader(r io.Reader, t Transformer) *Reader {
135	t.Reset()
136	return &Reader{
137		r:   r,
138		t:   t,
139		dst: make([]byte, defaultBufSize),
140		src: make([]byte, defaultBufSize),
141	}
142}
143
144// Read implements the io.Reader interface.
145func (r *Reader) Read(p []byte) (int, error) {
146	n, err := 0, error(nil)
147	for {
148		// Copy out any transformed bytes and return the final error if we are done.
149		if r.dst0 != r.dst1 {
150			n = copy(p, r.dst[r.dst0:r.dst1])
151			r.dst0 += n
152			if r.dst0 == r.dst1 && r.transformComplete {
153				return n, r.err
154			}
155			return n, nil
156		} else if r.transformComplete {
157			return 0, r.err
158		}
159
160		// Try to transform some source bytes, or to flush the transformer if we
161		// are out of source bytes. We do this even if r.r.Read returned an error.
162		// As the io.Reader documentation says, "process the n > 0 bytes returned
163		// before considering the error".
164		if r.src0 != r.src1 || r.err != nil {
165			r.dst0 = 0
166			r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
167			r.src0 += n
168
169			switch {
170			case err == nil:
171				if r.src0 != r.src1 {
172					r.err = errInconsistentByteCount
173				}
174				// The Transform call was successful; we are complete if we
175				// cannot read more bytes into src.
176				r.transformComplete = r.err != nil
177				continue
178			case err == ErrShortDst && (r.dst1 != 0 || n != 0):
179				// Make room in dst by copying out, and try again.
180				continue
181			case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
182				// Read more bytes into src via the code below, and try again.
183			default:
184				r.transformComplete = true
185				// The reader error (r.err) takes precedence over the
186				// transformer error (err) unless r.err is nil or io.EOF.
187				if r.err == nil || r.err == io.EOF {
188					r.err = err
189				}
190				continue
191			}
192		}
193
194		// Move any untransformed source bytes to the start of the buffer
195		// and read more bytes.
196		if r.src0 != 0 {
197			r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
198		}
199		n, r.err = r.r.Read(r.src[r.src1:])
200		r.src1 += n
201	}
202}
203
204// TODO: implement ReadByte (and ReadRune??).
205
206// Writer wraps another io.Writer by transforming the bytes read.
207// The user needs to call Close to flush unwritten bytes that may
208// be buffered.
209type Writer struct {
210	w   io.Writer
211	t   Transformer
212	dst []byte
213
214	// src[:n] contains bytes that have not yet passed through t.
215	src []byte
216	n   int
217}
218
219// NewWriter returns a new Writer that wraps w by transforming the bytes written
220// via t. It calls Reset on t.
221func NewWriter(w io.Writer, t Transformer) *Writer {
222	t.Reset()
223	return &Writer{
224		w:   w,
225		t:   t,
226		dst: make([]byte, defaultBufSize),
227		src: make([]byte, defaultBufSize),
228	}
229}
230
231// Write implements the io.Writer interface. If there are not enough
232// bytes available to complete a Transform, the bytes will be buffered
233// for the next write. Call Close to convert the remaining bytes.
234func (w *Writer) Write(data []byte) (n int, err error) {
235	src := data
236	if w.n > 0 {
237		// Append bytes from data to the last remainder.
238		// TODO: limit the amount copied on first try.
239		n = copy(w.src[w.n:], data)
240		w.n += n
241		src = w.src[:w.n]
242	}
243	for {
244		nDst, nSrc, err := w.t.Transform(w.dst, src, false)
245		if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
246			return n, werr
247		}
248		src = src[nSrc:]
249		if w.n == 0 {
250			n += nSrc
251		} else if len(src) <= n {
252			// Enough bytes from w.src have been consumed. We make src point
253			// to data instead to reduce the copying.
254			w.n = 0
255			n -= len(src)
256			src = data[n:]
257			if n < len(data) && (err == nil || err == ErrShortSrc) {
258				continue
259			}
260		}
261		switch err {
262		case ErrShortDst:
263			// This error is okay as long as we are making progress.
264			if nDst > 0 || nSrc > 0 {
265				continue
266			}
267		case ErrShortSrc:
268			if len(src) < len(w.src) {
269				m := copy(w.src, src)
270				// If w.n > 0, bytes from data were already copied to w.src and n
271				// was already set to the number of bytes consumed.
272				if w.n == 0 {
273					n += m
274				}
275				w.n = m
276				err = nil
277			} else if nDst > 0 || nSrc > 0 {
278				// Not enough buffer to store the remainder. Keep processing as
279				// long as there is progress. Without this case, transforms that
280				// require a lookahead larger than the buffer may result in an
281				// error. This is not something one may expect to be common in
282				// practice, but it may occur when buffers are set to small
283				// sizes during testing.
284				continue
285			}
286		case nil:
287			if w.n > 0 {
288				err = errInconsistentByteCount
289			}
290		}
291		return n, err
292	}
293}
294
295// Close implements the io.Closer interface.
296func (w *Writer) Close() error {
297	src := w.src[:w.n]
298	for {
299		nDst, nSrc, err := w.t.Transform(w.dst, src, true)
300		if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
301			return werr
302		}
303		if err != ErrShortDst {
304			return err
305		}
306		src = src[nSrc:]
307	}
308}
309
310type nop struct{ NopResetter }
311
312func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
313	n := copy(dst, src)
314	if n < len(src) {
315		err = ErrShortDst
316	}
317	return n, n, err
318}
319
320func (nop) Span(src []byte, atEOF bool) (n int, err error) {
321	return len(src), nil
322}
323
324type discard struct{ NopResetter }
325
326func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
327	return 0, len(src), nil
328}
329
330var (
331	// Discard is a Transformer for which all Transform calls succeed
332	// by consuming all bytes and writing nothing.
333	Discard Transformer = discard{}
334
335	// Nop is a SpanningTransformer that copies src to dst.
336	Nop SpanningTransformer = nop{}
337)
338
339// chain is a sequence of links. A chain with N Transformers has N+1 links and
340// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
341// buffers given to chain.Transform and the middle N-1 buffers are intermediate
342// buffers owned by the chain. The i'th link transforms bytes from the i'th
343// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
344// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
345type chain struct {
346	link []link
347	err  error
348	// errStart is the index at which the error occurred plus 1. Processing
349	// errStart at this level at the next call to Transform. As long as
350	// errStart > 0, chain will not consume any more source bytes.
351	errStart int
352}
353
354func (c *chain) fatalError(errIndex int, err error) {
355	if i := errIndex + 1; i > c.errStart {
356		c.errStart = i
357		c.err = err
358	}
359}
360
361type link struct {
362	t Transformer
363	// b[p:n] holds the bytes to be transformed by t.
364	b []byte
365	p int
366	n int
367}
368
369func (l *link) src() []byte {
370	return l.b[l.p:l.n]
371}
372
373func (l *link) dst() []byte {
374	return l.b[l.n:]
375}
376
377// Chain returns a Transformer that applies t in sequence.
378func Chain(t ...Transformer) Transformer {
379	if len(t) == 0 {
380		return nop{}
381	}
382	c := &chain{link: make([]link, len(t)+1)}
383	for i, tt := range t {
384		c.link[i].t = tt
385	}
386	// Allocate intermediate buffers.
387	b := make([][defaultBufSize]byte, len(t)-1)
388	for i := range b {
389		c.link[i+1].b = b[i][:]
390	}
391	return c
392}
393
394// Reset resets the state of Chain. It calls Reset on all the Transformers.
395func (c *chain) Reset() {
396	for i, l := range c.link {
397		if l.t != nil {
398			l.t.Reset()
399		}
400		c.link[i].p, c.link[i].n = 0, 0
401	}
402}
403
404// TODO: make chain use Span (is going to be fun to implement!)
405
406// Transform applies the transformers of c in sequence.
407func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
408	// Set up src and dst in the chain.
409	srcL := &c.link[0]
410	dstL := &c.link[len(c.link)-1]
411	srcL.b, srcL.p, srcL.n = src, 0, len(src)
412	dstL.b, dstL.n = dst, 0
413	var lastFull, needProgress bool // for detecting progress
414
415	// i is the index of the next Transformer to apply, for i in [low, high].
416	// low is the lowest index for which c.link[low] may still produce bytes.
417	// high is the highest index for which c.link[high] has a Transformer.
418	// The error returned by Transform determines whether to increase or
419	// decrease i. We try to completely fill a buffer before converting it.
420	for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
421		in, out := &c.link[i], &c.link[i+1]
422		nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
423		out.n += nDst
424		in.p += nSrc
425		if i > 0 && in.p == in.n {
426			in.p, in.n = 0, 0
427		}
428		needProgress, lastFull = lastFull, false
429		switch err0 {
430		case ErrShortDst:
431			// Process the destination buffer next. Return if we are already
432			// at the high index.
433			if i == high {
434				return dstL.n, srcL.p, ErrShortDst
435			}
436			if out.n != 0 {
437				i++
438				// If the Transformer at the next index is not able to process any
439				// source bytes there is nothing that can be done to make progress
440				// and the bytes will remain unprocessed. lastFull is used to
441				// detect this and break out of the loop with a fatal error.
442				lastFull = true
443				continue
444			}
445			// The destination buffer was too small, but is completely empty.
446			// Return a fatal error as this transformation can never complete.
447			c.fatalError(i, errShortInternal)
448		case ErrShortSrc:
449			if i == 0 {
450				// Save ErrShortSrc in err. All other errors take precedence.
451				err = ErrShortSrc
452				break
453			}
454			// Source bytes were depleted before filling up the destination buffer.
455			// Verify we made some progress, move the remaining bytes to the errStart
456			// and try to get more source bytes.
457			if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
458				// There were not enough source bytes to proceed while the source
459				// buffer cannot hold any more bytes. Return a fatal error as this
460				// transformation can never complete.
461				c.fatalError(i, errShortInternal)
462				break
463			}
464			// in.b is an internal buffer and we can make progress.
465			in.p, in.n = 0, copy(in.b, in.src())
466			fallthrough
467		case nil:
468			// if i == low, we have depleted the bytes at index i or any lower levels.
469			// In that case we increase low and i. In all other cases we decrease i to
470			// fetch more bytes before proceeding to the next index.
471			if i > low {
472				i--
473				continue
474			}
475		default:
476			c.fatalError(i, err0)
477		}
478		// Exhausted level low or fatal error: increase low and continue
479		// to process the bytes accepted so far.
480		i++
481		low = i
482	}
483
484	// If c.errStart > 0, this means we found a fatal error.  We will clear
485	// all upstream buffers. At this point, no more progress can be made
486	// downstream, as Transform would have bailed while handling ErrShortDst.
487	if c.errStart > 0 {
488		for i := 1; i < c.errStart; i++ {
489			c.link[i].p, c.link[i].n = 0, 0
490		}
491		err, c.errStart, c.err = c.err, 0, nil
492	}
493	return dstL.n, srcL.p, err
494}
495
496// Deprecated: use runes.Remove instead.
497func RemoveFunc(f func(r rune) bool) Transformer {
498	return removeF(f)
499}
500
501type removeF func(r rune) bool
502
503func (removeF) Reset() {}
504
505// Transform implements the Transformer interface.
506func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
507	for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
508
509		if r = rune(src[0]); r < utf8.RuneSelf {
510			sz = 1
511		} else {
512			r, sz = utf8.DecodeRune(src)
513
514			if sz == 1 {
515				// Invalid rune.
516				if !atEOF && !utf8.FullRune(src) {
517					err = ErrShortSrc
518					break
519				}
520				// We replace illegal bytes with RuneError. Not doing so might
521				// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
522				// The resulting byte sequence may subsequently contain runes
523				// for which t(r) is true that were passed unnoticed.
524				if !t(r) {
525					if nDst+3 > len(dst) {
526						err = ErrShortDst
527						break
528					}
529					nDst += copy(dst[nDst:], "\uFFFD")
530				}
531				nSrc++
532				continue
533			}
534		}
535
536		if !t(r) {
537			if nDst+sz > len(dst) {
538				err = ErrShortDst
539				break
540			}
541			nDst += copy(dst[nDst:], src[:sz])
542		}
543		nSrc += sz
544	}
545	return
546}
547
548// grow returns a new []byte that is longer than b, and copies the first n bytes
549// of b to the start of the new slice.
550func grow(b []byte, n int) []byte {
551	m := len(b)
552	if m <= 32 {
553		m = 64
554	} else if m <= 256 {
555		m *= 2
556	} else {
557		m += m >> 1
558	}
559	buf := make([]byte, m)
560	copy(buf, b[:n])
561	return buf
562}
563
564const initialBufSize = 128
565
566// String returns a string with the result of converting s[:n] using t, where
567// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
568func String(t Transformer, s string) (result string, n int, err error) {
569	t.Reset()
570	if s == "" {
571		// Fast path for the common case for empty input. Results in about a
572		// 86% reduction of running time for BenchmarkStringLowerEmpty.
573		if _, _, err := t.Transform(nil, nil, true); err == nil {
574			return "", 0, nil
575		}
576	}
577
578	// Allocate only once. Note that both dst and src escape when passed to
579	// Transform.
580	buf := [2 * initialBufSize]byte{}
581	dst := buf[:initialBufSize:initialBufSize]
582	src := buf[initialBufSize : 2*initialBufSize]
583
584	// The input string s is transformed in multiple chunks (starting with a
585	// chunk size of initialBufSize). nDst and nSrc are per-chunk (or
586	// per-Transform-call) indexes, pDst and pSrc are overall indexes.
587	nDst, nSrc := 0, 0
588	pDst, pSrc := 0, 0
589
590	// pPrefix is the length of a common prefix: the first pPrefix bytes of the
591	// result will equal the first pPrefix bytes of s. It is not guaranteed to
592	// be the largest such value, but if pPrefix, len(result) and len(s) are
593	// all equal after the final transform (i.e. calling Transform with atEOF
594	// being true returned nil error) then we don't need to allocate a new
595	// result string.
596	pPrefix := 0
597	for {
598		// Invariant: pDst == pPrefix && pSrc == pPrefix.
599
600		n := copy(src, s[pSrc:])
601		nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
602		pDst += nDst
603		pSrc += nSrc
604
605		// TODO:  let transformers implement an optional Spanner interface, akin
606		// to norm's QuickSpan. This would even allow us to avoid any allocation.
607		if !bytes.Equal(dst[:nDst], src[:nSrc]) {
608			break
609		}
610		pPrefix = pSrc
611		if err == ErrShortDst {
612			// A buffer can only be short if a transformer modifies its input.
613			break
614		} else if err == ErrShortSrc {
615			if nSrc == 0 {
616				// No progress was made.
617				break
618			}
619			// Equal so far and !atEOF, so continue checking.
620		} else if err != nil || pPrefix == len(s) {
621			return string(s[:pPrefix]), pPrefix, err
622		}
623	}
624	// Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.
625
626	// We have transformed the first pSrc bytes of the input s to become pDst
627	// transformed bytes. Those transformed bytes are discontiguous: the first
628	// pPrefix of them equal s[:pPrefix] and the last nDst of them equal
629	// dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
630	// that they become one contiguous slice: dst[:pDst].
631	if pPrefix != 0 {
632		newDst := dst
633		if pDst > len(newDst) {
634			newDst = make([]byte, len(s)+nDst-nSrc)
635		}
636		copy(newDst[pPrefix:pDst], dst[:nDst])
637		copy(newDst[:pPrefix], s[:pPrefix])
638		dst = newDst
639	}
640
641	// Prevent duplicate Transform calls with atEOF being true at the end of
642	// the input. Also return if we have an unrecoverable error.
643	if (err == nil && pSrc == len(s)) ||
644		(err != nil && err != ErrShortDst && err != ErrShortSrc) {
645		return string(dst[:pDst]), pSrc, err
646	}
647
648	// Transform the remaining input, growing dst and src buffers as necessary.
649	for {
650		n := copy(src, s[pSrc:])
651		nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
652		pDst += nDst
653		pSrc += nSrc
654
655		// If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
656		// make progress. This may avoid excessive allocations.
657		if err == ErrShortDst {
658			if nDst == 0 {
659				dst = grow(dst, pDst)
660			}
661		} else if err == ErrShortSrc {
662			if nSrc == 0 {
663				src = grow(src, 0)
664			}
665		} else if err != nil || pSrc == len(s) {
666			return string(dst[:pDst]), pSrc, err
667		}
668	}
669}
670
671// Bytes returns a new byte slice with the result of converting b[:n] using t,
672// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
673func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
674	return doAppend(t, 0, make([]byte, len(b)), b)
675}
676
677// Append appends the result of converting src[:n] using t to dst, where
678// n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
679func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
680	if len(dst) == cap(dst) {
681		n := len(src) + len(dst) // It is okay for this to be 0.
682		b := make([]byte, n)
683		dst = b[:copy(b, dst)]
684	}
685	return doAppend(t, len(dst), dst[:cap(dst)], src)
686}
687
688func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
689	t.Reset()
690	pSrc := 0
691	for {
692		nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
693		pDst += nDst
694		pSrc += nSrc
695		if err != ErrShortDst {
696			return dst[:pDst], pSrc, err
697		}
698
699		// Grow the destination buffer, but do not grow as long as we can make
700		// progress. This may avoid excessive allocations.
701		if nDst == 0 {
702			dst = grow(dst, pDst)
703		}
704	}
705}
706