1 /*
2  * Copyright © 2012,2017  Google, Inc.
3  *
4  *  This is part of HarfBuzz, a text shaping library.
5  *
6  * Permission is hereby granted, without written agreement and without
7  * license or royalty fees, to use, copy, modify, and distribute this
8  * software and its documentation for any purpose, provided that the
9  * above copyright notice and the following two paragraphs appear in
10  * all copies of this software.
11  *
12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16  * DAMAGE.
17  *
18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23  *
24  * Google Author(s): Behdad Esfahbod
25  */
26 
27 #ifndef HB_SET_HH
28 #define HB_SET_HH
29 
30 #include "hb.hh"
31 
32 
33 /*
34  * hb_set_t
35  */
36 
37 /* TODO Keep a free-list so we can free pages that are completely zeroed.  At that
38  * point maybe also use a sentinel value for "all-1" pages? */
39 
40 struct hb_set_t
41 {
42   HB_NO_COPY_ASSIGN (hb_set_t);
hb_set_thb_set_t43   hb_set_t ()  { init (); }
~hb_set_thb_set_t44   ~hb_set_t () { fini (); }
45 
46   struct page_map_t
47   {
cmphb_set_t::page_map_t48     int cmp (const page_map_t &o) const { return (int) o.major - (int) major; }
49 
50     uint32_t major;
51     uint32_t index;
52   };
53 
54   struct page_t
55   {
init0hb_set_t::page_t56     void init0 () { v.clear (); }
init1hb_set_t::page_t57     void init1 () { v.clear (0xFF); }
58 
lenhb_set_t::page_t59     unsigned int len () const
60     { return ARRAY_LENGTH_CONST (v); }
61 
is_emptyhb_set_t::page_t62     bool is_empty () const
63     {
64       for (unsigned int i = 0; i < len (); i++)
65         if (v[i])
66 	  return false;
67       return true;
68     }
69 
addhb_set_t::page_t70     void add (hb_codepoint_t g) { elt (g) |= mask (g); }
delhb_set_t::page_t71     void del (hb_codepoint_t g) { elt (g) &= ~mask (g); }
hashb_set_t::page_t72     bool has (hb_codepoint_t g) const { return !!(elt (g) & mask (g)); }
73 
add_rangehb_set_t::page_t74     void add_range (hb_codepoint_t a, hb_codepoint_t b)
75     {
76       elt_t *la = &elt (a);
77       elt_t *lb = &elt (b);
78       if (la == lb)
79         *la |= (mask (b) << 1) - mask(a);
80       else
81       {
82 	*la |= ~(mask (a) - 1);
83 	la++;
84 
85 	memset (la, 0xff, (char *) lb - (char *) la);
86 
87 	*lb |= ((mask (b) << 1) - 1);
88       }
89     }
90 
is_equalhb_set_t::page_t91     bool is_equal (const page_t *other) const
92     {
93       return 0 == hb_memcmp (&v, &other->v, sizeof (v));
94     }
95 
get_populationhb_set_t::page_t96     unsigned int get_population () const
97     {
98       unsigned int pop = 0;
99       for (unsigned int i = 0; i < len (); i++)
100         pop += hb_popcount (v[i]);
101       return pop;
102     }
103 
nexthb_set_t::page_t104     bool next (hb_codepoint_t *codepoint) const
105     {
106       unsigned int m = (*codepoint + 1) & MASK;
107       if (!m)
108       {
109 	*codepoint = INVALID;
110 	return false;
111       }
112       unsigned int i = m / ELT_BITS;
113       unsigned int j = m & ELT_MASK;
114 
115       const elt_t vv = v[i] & ~((elt_t (1) << j) - 1);
116       for (const elt_t *p = &vv; i < len (); p = &v[++i])
117 	if (*p)
118 	{
119 	  *codepoint = i * ELT_BITS + elt_get_min (*p);
120 	  return true;
121 	}
122 
123       *codepoint = INVALID;
124       return false;
125     }
previoushb_set_t::page_t126     bool previous (hb_codepoint_t *codepoint) const
127     {
128       unsigned int m = (*codepoint - 1) & MASK;
129       if (m == MASK)
130       {
131 	*codepoint = INVALID;
132 	return false;
133       }
134       unsigned int i = m / ELT_BITS;
135       unsigned int j = m & ELT_MASK;
136 
137       const elt_t vv = v[i] & ((elt_t (1) << (j + 1)) - 1);
138       for (const elt_t *p = &vv; (int) i >= 0; p = &v[--i])
139 	if (*p)
140 	{
141 	  *codepoint = i * ELT_BITS + elt_get_max (*p);
142 	  return true;
143 	}
144 
145       *codepoint = INVALID;
146       return false;
147     }
get_minhb_set_t::page_t148     hb_codepoint_t get_min () const
149     {
150       for (unsigned int i = 0; i < len (); i++)
151         if (v[i])
152 	  return i * ELT_BITS + elt_get_min (v[i]);
153       return INVALID;
154     }
get_maxhb_set_t::page_t155     hb_codepoint_t get_max () const
156     {
157       for (int i = len () - 1; i >= 0; i--)
158         if (v[i])
159 	  return i * ELT_BITS + elt_get_max (v[i]);
160       return 0;
161     }
162 
163     typedef unsigned long long elt_t;
164     enum { PAGE_BITS = 512 };
165     static_assert ((PAGE_BITS & ((PAGE_BITS) - 1)) == 0, "");
166 
elt_get_minhb_set_t::page_t167     static unsigned int elt_get_min (const elt_t &elt) { return hb_ctz (elt); }
elt_get_maxhb_set_t::page_t168     static unsigned int elt_get_max (const elt_t &elt) { return hb_bit_storage (elt) - 1; }
169 
170     typedef hb_vector_size_t<elt_t, PAGE_BITS / 8> vector_t;
171 
172     enum { ELT_BITS = sizeof (elt_t) * 8 };
173     enum { ELT_MASK = ELT_BITS - 1 };
174     enum { BITS = sizeof (vector_t) * 8 };
175     enum { MASK = BITS - 1 };
176     static_assert ((unsigned) PAGE_BITS == (unsigned) BITS, "");
177 
elthb_set_t::page_t178     elt_t &elt (hb_codepoint_t g) { return v[(g & MASK) / ELT_BITS]; }
elthb_set_t::page_t179     elt_t const &elt (hb_codepoint_t g) const { return v[(g & MASK) / ELT_BITS]; }
maskhb_set_t::page_t180     elt_t mask (hb_codepoint_t g) const { return elt_t (1) << (g & ELT_MASK); }
181 
182     vector_t v;
183   };
184   static_assert (page_t::PAGE_BITS == sizeof (page_t) * 8, "");
185 
186   hb_object_header_t header;
187   bool successful; /* Allocations successful */
188   mutable unsigned int population;
189   hb_vector_t<page_map_t, 1> page_map;
190   hb_vector_t<page_t, 1> pages;
191 
init_shallowhb_set_t192   void init_shallow ()
193   {
194     successful = true;
195     population = 0;
196     page_map.init ();
197     pages.init ();
198   }
inithb_set_t199   void init ()
200   {
201     hb_object_init (this);
202     init_shallow ();
203   }
fini_shallowhb_set_t204   void fini_shallow ()
205   {
206     population = 0;
207     page_map.fini ();
208     pages.fini ();
209   }
finihb_set_t210   void fini ()
211   {
212     hb_object_fini (this);
213     fini_shallow ();
214   }
215 
in_errorhb_set_t216   bool in_error () const { return !successful; }
217 
resizehb_set_t218   bool resize (unsigned int count)
219   {
220     if (unlikely (!successful)) return false;
221     if (!pages.resize (count) || !page_map.resize (count))
222     {
223       pages.resize (page_map.len);
224       successful = false;
225       return false;
226     }
227     return true;
228   }
229 
clearhb_set_t230   void clear ()
231   {
232     if (unlikely (hb_object_is_immutable (this)))
233       return;
234     successful = true;
235     population = 0;
236     page_map.resize (0);
237     pages.resize (0);
238   }
is_emptyhb_set_t239   bool is_empty () const
240   {
241     unsigned int count = pages.len;
242     for (unsigned int i = 0; i < count; i++)
243       if (!pages[i].is_empty ())
244         return false;
245     return true;
246   }
247 
dirtyhb_set_t248   void dirty () { population = (unsigned int) -1; }
249 
addhb_set_t250   void add (hb_codepoint_t g)
251   {
252     if (unlikely (!successful)) return;
253     if (unlikely (g == INVALID)) return;
254     dirty ();
255     page_t *page = page_for_insert (g); if (unlikely (!page)) return;
256     page->add (g);
257   }
add_rangehb_set_t258   bool add_range (hb_codepoint_t a, hb_codepoint_t b)
259   {
260     if (unlikely (!successful)) return true; /* https://github.com/harfbuzz/harfbuzz/issues/657 */
261     if (unlikely (a > b || a == INVALID || b == INVALID)) return false;
262     dirty ();
263     unsigned int ma = get_major (a);
264     unsigned int mb = get_major (b);
265     if (ma == mb)
266     {
267       page_t *page = page_for_insert (a); if (unlikely (!page)) return false;
268       page->add_range (a, b);
269     }
270     else
271     {
272       page_t *page = page_for_insert (a); if (unlikely (!page)) return false;
273       page->add_range (a, major_start (ma + 1) - 1);
274 
275       for (unsigned int m = ma + 1; m < mb; m++)
276       {
277 	page = page_for_insert (major_start (m)); if (unlikely (!page)) return false;
278 	page->init1 ();
279       }
280 
281       page = page_for_insert (b); if (unlikely (!page)) return false;
282       page->add_range (major_start (mb), b);
283     }
284     return true;
285   }
286 
287   template <typename T>
add_arrayhb_set_t288   void add_array (const T *array, unsigned int count, unsigned int stride=sizeof(T))
289   {
290     if (unlikely (!successful)) return;
291     if (!count) return;
292     dirty ();
293     hb_codepoint_t g = *array;
294     while (count)
295     {
296       unsigned int m = get_major (g);
297       page_t *page = page_for_insert (g); if (unlikely (!page)) return;
298       unsigned int start = major_start (m);
299       unsigned int end = major_start (m + 1);
300       do
301       {
302 	page->add (g);
303 
304 	array = (const T *) ((const char *) array + stride);
305 	count--;
306       }
307       while (count && (g = *array, start <= g && g < end));
308     }
309   }
310 
311   /* Might return false if array looks unsorted.
312    * Used for faster rejection of corrupt data. */
313   template <typename T>
add_sorted_arrayhb_set_t314   bool add_sorted_array (const T *array, unsigned int count, unsigned int stride=sizeof(T))
315   {
316     if (unlikely (!successful)) return true; /* https://github.com/harfbuzz/harfbuzz/issues/657 */
317     if (!count) return true;
318     dirty ();
319     hb_codepoint_t g = *array;
320     hb_codepoint_t last_g = g;
321     while (count)
322     {
323       unsigned int m = get_major (g);
324       page_t *page = page_for_insert (g); if (unlikely (!page)) return false;
325       unsigned int end = major_start (m + 1);
326       do
327       {
328         /* If we try harder we can change the following comparison to <=;
329 	 * Not sure if it's worth it. */
330         if (g < last_g) return false;
331 	last_g = g;
332 	page->add (g);
333 
334 	array = (const T *) ((const char *) array + stride);
335 	count--;
336       }
337       while (count && (g = *array, g < end));
338     }
339     return true;
340   }
341 
delhb_set_t342   void del (hb_codepoint_t g)
343   {
344     /* TODO perform op even if !successful. */
345     if (unlikely (!successful)) return;
346     page_t *page = page_for (g);
347     if (!page)
348       return;
349     dirty ();
350     page->del (g);
351   }
del_rangehb_set_t352   void del_range (hb_codepoint_t a, hb_codepoint_t b)
353   {
354     /* TODO perform op even if !successful. */
355     /* TODO Optimize, like add_range(). */
356     if (unlikely (!successful)) return;
357     for (unsigned int i = a; i < b + 1; i++)
358       del (i);
359   }
hashb_set_t360   bool has (hb_codepoint_t g) const
361   {
362     const page_t *page = page_for (g);
363     if (!page)
364       return false;
365     return page->has (g);
366   }
intersectshb_set_t367   bool intersects (hb_codepoint_t first,
368 			  hb_codepoint_t last) const
369   {
370     hb_codepoint_t c = first - 1;
371     return next (&c) && c <= last;
372   }
sethb_set_t373   void set (const hb_set_t *other)
374   {
375     if (unlikely (!successful)) return;
376     unsigned int count = other->pages.len;
377     if (!resize (count))
378       return;
379     population = other->population;
380     memcpy ((void *) pages, (const void *) other->pages, count * pages.item_size);
381     memcpy ((void *) page_map, (const void *) other->page_map, count * page_map.item_size);
382   }
383 
is_equalhb_set_t384   bool is_equal (const hb_set_t *other) const
385   {
386     if (get_population () != other->get_population ())
387       return false;
388 
389     unsigned int na = pages.len;
390     unsigned int nb = other->pages.len;
391 
392     unsigned int a = 0, b = 0;
393     for (; a < na && b < nb; )
394     {
395       if (page_at (a).is_empty ()) { a++; continue; }
396       if (other->page_at (b).is_empty ()) { b++; continue; }
397       if (page_map[a].major != other->page_map[b].major ||
398 	  !page_at (a).is_equal (&other->page_at (b)))
399         return false;
400       a++;
401       b++;
402     }
403     for (; a < na; a++)
404       if (!page_at (a).is_empty ()) { return false; }
405     for (; b < nb; b++)
406       if (!other->page_at (b).is_empty ()) { return false; }
407 
408     return true;
409   }
410 
is_subsethb_set_t411   bool is_subset (const hb_set_t *larger_set) const
412   {
413     if (get_population () > larger_set->get_population ())
414       return false;
415 
416     /* TODO Optimize to use pages. */
417     hb_codepoint_t c = INVALID;
418     while (next (&c))
419       if (!larger_set->has (c))
420         return false;
421 
422     return true;
423   }
424 
425   template <class Op>
processhb_set_t426   void process (const hb_set_t *other)
427   {
428     if (unlikely (!successful)) return;
429 
430     dirty ();
431 
432     unsigned int na = pages.len;
433     unsigned int nb = other->pages.len;
434     unsigned int next_page = na;
435 
436     unsigned int count = 0, newCount = 0;
437     unsigned int a = 0, b = 0;
438     for (; a < na && b < nb; )
439     {
440       if (page_map[a].major == other->page_map[b].major)
441       {
442         count++;
443 	a++;
444 	b++;
445       }
446       else if (page_map[a].major < other->page_map[b].major)
447       {
448         if (Op::passthru_left)
449 	  count++;
450         a++;
451       }
452       else
453       {
454         if (Op::passthru_right)
455 	  count++;
456         b++;
457       }
458     }
459     if (Op::passthru_left)
460       count += na - a;
461     if (Op::passthru_right)
462       count += nb - b;
463 
464     if (count > pages.len)
465       if (!resize (count))
466         return;
467     newCount = count;
468 
469     /* Process in-place backward. */
470     a = na;
471     b = nb;
472     for (; a && b; )
473     {
474       if (page_map[a - 1].major == other->page_map[b - 1].major)
475       {
476 	a--;
477 	b--;
478 	count--;
479 	page_map[count] = page_map[a];
480 	Op::process (page_at (count).v, page_at (a).v, other->page_at (b).v);
481       }
482       else if (page_map[a - 1].major > other->page_map[b - 1].major)
483       {
484 	a--;
485 	if (Op::passthru_left)
486 	{
487 	  count--;
488 	  page_map[count] = page_map[a];
489 	}
490       }
491       else
492       {
493 	b--;
494 	if (Op::passthru_right)
495 	{
496 	  count--;
497 	  page_map[count].major = other->page_map[b].major;
498 	  page_map[count].index = next_page++;
499 	  page_at (count).v = other->page_at (b).v;
500 	}
501       }
502     }
503     if (Op::passthru_left)
504       while (a)
505       {
506 	a--;
507 	count--;
508 	page_map[count] = page_map [a];
509       }
510     if (Op::passthru_right)
511       while (b)
512       {
513 	b--;
514 	count--;
515 	page_map[count].major = other->page_map[b].major;
516 	page_map[count].index = next_page++;
517 	page_at (count).v = other->page_at (b).v;
518       }
519     assert (!count);
520     if (pages.len > newCount)
521       resize (newCount);
522   }
523 
union_hb_set_t524   void union_ (const hb_set_t *other)
525   {
526     process<HbOpOr> (other);
527   }
intersecthb_set_t528   void intersect (const hb_set_t *other)
529   {
530     process<HbOpAnd> (other);
531   }
subtracthb_set_t532   void subtract (const hb_set_t *other)
533   {
534     process<HbOpMinus> (other);
535   }
symmetric_differencehb_set_t536   void symmetric_difference (const hb_set_t *other)
537   {
538     process<HbOpXor> (other);
539   }
nexthb_set_t540   bool next (hb_codepoint_t *codepoint) const
541   {
542     if (unlikely (*codepoint == INVALID)) {
543       *codepoint = get_min ();
544       return *codepoint != INVALID;
545     }
546 
547     page_map_t map = {get_major (*codepoint), 0};
548     unsigned int i;
549     page_map.bfind (map, &i, HB_BFIND_NOT_FOUND_STORE_CLOSEST);
550     if (i < page_map.len && page_map[i].major == map.major)
551     {
552       if (pages[page_map[i].index].next (codepoint))
553       {
554 	*codepoint += page_map[i].major * page_t::PAGE_BITS;
555 	return true;
556       }
557       i++;
558     }
559     for (; i < page_map.len; i++)
560     {
561       hb_codepoint_t m = pages[page_map[i].index].get_min ();
562       if (m != INVALID)
563       {
564 	*codepoint = page_map[i].major * page_t::PAGE_BITS + m;
565 	return true;
566       }
567     }
568     *codepoint = INVALID;
569     return false;
570   }
previoushb_set_t571   bool previous (hb_codepoint_t *codepoint) const
572   {
573     if (unlikely (*codepoint == INVALID)) {
574       *codepoint = get_max ();
575       return *codepoint != INVALID;
576     }
577 
578     page_map_t map = {get_major (*codepoint), 0};
579     unsigned int i;
580     page_map.bfind (map, &i, HB_BFIND_NOT_FOUND_STORE_CLOSEST);
581     if (i < page_map.len && page_map[i].major == map.major)
582     {
583       if (pages[page_map[i].index].previous (codepoint))
584       {
585 	*codepoint += page_map[i].major * page_t::PAGE_BITS;
586 	return true;
587       }
588     }
589     i--;
590     for (; (int) i >= 0; i--)
591     {
592       hb_codepoint_t m = pages[page_map[i].index].get_max ();
593       if (m != INVALID)
594       {
595 	*codepoint = page_map[i].major * page_t::PAGE_BITS + m;
596 	return true;
597       }
598     }
599     *codepoint = INVALID;
600     return false;
601   }
next_rangehb_set_t602   bool next_range (hb_codepoint_t *first, hb_codepoint_t *last) const
603   {
604     hb_codepoint_t i;
605 
606     i = *last;
607     if (!next (&i))
608     {
609       *last = *first = INVALID;
610       return false;
611     }
612 
613     /* TODO Speed up. */
614     *last = *first = i;
615     while (next (&i) && i == *last + 1)
616       (*last)++;
617 
618     return true;
619   }
previous_rangehb_set_t620   bool previous_range (hb_codepoint_t *first, hb_codepoint_t *last) const
621   {
622     hb_codepoint_t i;
623 
624     i = *first;
625     if (!previous (&i))
626     {
627       *last = *first = INVALID;
628       return false;
629     }
630 
631     /* TODO Speed up. */
632     *last = *first = i;
633     while (previous (&i) && i == *first - 1)
634       (*first)--;
635 
636     return true;
637   }
638 
get_populationhb_set_t639   unsigned int get_population () const
640   {
641     if (population != (unsigned int) -1)
642       return population;
643 
644     unsigned int pop = 0;
645     unsigned int count = pages.len;
646     for (unsigned int i = 0; i < count; i++)
647       pop += pages[i].get_population ();
648 
649     population = pop;
650     return pop;
651   }
get_minhb_set_t652   hb_codepoint_t get_min () const
653   {
654     unsigned int count = pages.len;
655     for (unsigned int i = 0; i < count; i++)
656       if (!page_at (i).is_empty ())
657         return page_map[i].major * page_t::PAGE_BITS + page_at (i).get_min ();
658     return INVALID;
659   }
get_maxhb_set_t660   hb_codepoint_t get_max () const
661   {
662     unsigned int count = pages.len;
663     for (int i = count - 1; i >= 0; i++)
664       if (!page_at (i).is_empty ())
665         return page_map[(unsigned) i].major * page_t::PAGE_BITS + page_at (i).get_max ();
666     return INVALID;
667   }
668 
669   static  const hb_codepoint_t INVALID = HB_SET_VALUE_INVALID;
670 
page_for_inserthb_set_t671   page_t *page_for_insert (hb_codepoint_t g)
672   {
673     page_map_t map = {get_major (g), pages.len};
674     unsigned int i;
675     if (!page_map.bfind (map, &i, HB_BFIND_NOT_FOUND_STORE_CLOSEST))
676     {
677       if (!resize (pages.len + 1))
678 	return nullptr;
679 
680       pages[map.index].init0 ();
681       memmove (page_map + i + 1,
682 	       page_map + i,
683 	       (page_map.len - 1 - i) * page_map.item_size);
684       page_map[i] = map;
685     }
686     return &pages[page_map[i].index];
687   }
page_forhb_set_t688   page_t *page_for (hb_codepoint_t g)
689   {
690     page_map_t key = {get_major (g)};
691     const page_map_t *found = page_map.bsearch (key);
692     if (found)
693       return &pages[found->index];
694     return nullptr;
695   }
page_forhb_set_t696   const page_t *page_for (hb_codepoint_t g) const
697   {
698     page_map_t key = {get_major (g)};
699     const page_map_t *found = page_map.bsearch (key);
700     if (found)
701       return &pages[found->index];
702     return nullptr;
703   }
page_athb_set_t704   page_t &page_at (unsigned int i) { return pages[page_map[i].index]; }
page_athb_set_t705   const page_t &page_at (unsigned int i) const { return pages[page_map[i].index]; }
get_majorhb_set_t706   unsigned int get_major (hb_codepoint_t g) const { return g / page_t::PAGE_BITS; }
major_starthb_set_t707   hb_codepoint_t major_start (unsigned int major) const { return major * page_t::PAGE_BITS; }
708 };
709 
710 
711 #endif /* HB_SET_HH */
712