1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <memory.h>
15 #include <math.h>
16 #include <assert.h>
17 
18 #include "config/av1_rtcd.h"
19 
20 #include "av1/common/warped_motion.h"
21 #include "av1/common/scale.h"
22 
23 #define WARP_ERROR_BLOCK 32
24 
25 /* clang-format off */
26 static const int error_measure_lut[512] = {
27   // pow 0.7
28   16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
29   16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
30   15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
31   15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
32   14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
33   14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
34   14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
35   13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
36   13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
37   12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
38   12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
39   12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
40   11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
41   11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
42   10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
43   10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
44   10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
45   9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
46   9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
47   8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
48   8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
49   7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
50   7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
51   6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
52   6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
53   5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
54   5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
55   4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
56   3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
57   3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
58   2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
59   1323, 1187, 1045,  894,  731,  550,  339,    0,
60   339,  550,  731,  894, 1045, 1187, 1323, 1452,
61   1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
62   2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
63   3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
64   3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
65   4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
66   5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
67   5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
68   6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
69   6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
70   7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
71   7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
72   8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
73   8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
74   9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
75   9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
76   10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
77   10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
78   11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
79   11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
80   11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
81   12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
82   12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
83   13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
84   13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
85   13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
86   14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
87   14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
88   15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
89   15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
90   15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
91   16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
92 };
93 /* clang-format on */
94 
95 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
96 // at a time. The zoom/rotation/shear in the model are applied to the
97 // "fractional" position of each pixel, which therefore varies within
98 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
99 // We need an extra 2 taps to fit this in, for a total of 8 taps.
100 /* clang-format off */
101 const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
102 #if WARPEDPIXEL_PREC_BITS == 6
103   // [-1, 0)
104   { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
105   { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
106   { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
107   { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
108   { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
109   { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
110   { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
111   { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
112   { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
113   { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
114   { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
115   { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
116   { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
117   { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
118   { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
119   { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
120   { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
121   { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
122   { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
123   { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
124   { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
125   { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
126   { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
127   { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
128   { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
129   { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
130   { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
131   { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
132   { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
133   { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
134   { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
135   { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
136 
137   // [0, 1)
138   { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
139   { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
140   { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
141   {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
142   {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
143   {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
144   {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
145   {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
146   {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
147   {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
148   {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
149   {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
150   {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
151   {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
152   {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
153   {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
154   {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
155   {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
156   {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
157   {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
158   {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
159   {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
160   {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
161   {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
162   {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
163   {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
164   {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
165   {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
166   {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
167   {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
168   { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
169   { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
170 
171   // [1, 2)
172   { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
173   { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
174   { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
175   { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
176   { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
177   { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
178   { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
179   { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
180   { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
181   { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
182   { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
183   { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
184   { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
185   { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
186   { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
187   { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
188   { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
189   { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
190   { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
191   { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
192   { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
193   { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
194   { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
195   { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
196   { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
197   { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
198   { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
199   { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
200   { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
201   { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
202   { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
203   { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
204   // dummy (replicate row index 191)
205   { 0, 0, 0,   0,   2, 127, - 1, 0 },
206 
207 #elif WARPEDPIXEL_PREC_BITS == 5
208   // [-1, 0)
209   {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
210   {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
211   {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
212   {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
213   {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
214   {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
215   {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
216   {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
217   {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
218   {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
219   {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
220   {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
221   {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
222   {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
223   {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
224   {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
225   // [0, 1)
226   { 0,  0,   0, 127,   1,   0,   0,  0}, { 0,  1,  -3, 127,   4,  -2,   1,  0},
227   { 0,  2,  -6, 126,   8,  -3,   1,  0}, {-1,  3,  -8, 125,  13,  -5,   2, -1},
228   {-1,  4, -11, 123,  18,  -7,   3, -1}, {-1,  4, -13, 121,  23,  -8,   3, -1},
229   {-1,  5, -15, 119,  27, -10,   4, -1}, {-2,  6, -17, 116,  33, -12,   5, -1},
230   {-2,  6, -18, 113,  38, -13,   5, -1}, {-2,  7, -19, 110,  43, -15,   6, -2},
231   {-2,  7, -20, 106,  49, -16,   6, -2}, {-2,  7, -21, 102,  54, -17,   7, -2},
232   {-2,  8, -22,  98,  59, -18,   7, -2}, {-2,  8, -22,  94,  64, -19,   7, -2},
233   {-2,  8, -22,  89,  69, -20,   8, -2}, {-2,  8, -21,  84,  74, -21,   8, -2},
234   {-2,  8, -21,  79,  79, -21,   8, -2}, {-2,  8, -21,  74,  84, -21,   8, -2},
235   {-2,  8, -20,  69,  89, -22,   8, -2}, {-2,  7, -19,  64,  94, -22,   8, -2},
236   {-2,  7, -18,  59,  98, -22,   8, -2}, {-2,  7, -17,  54, 102, -21,   7, -2},
237   {-2,  6, -16,  49, 106, -20,   7, -2}, {-2,  6, -15,  43, 110, -19,   7, -2},
238   {-1,  5, -13,  38, 113, -18,   6, -2}, {-1,  5, -12,  33, 116, -17,   6, -2},
239   {-1,  4, -10,  27, 119, -15,   5, -1}, {-1,  3,  -8,  23, 121, -13,   4, -1},
240   {-1,  3,  -7,  18, 123, -11,   4, -1}, {-1,  2,  -5,  13, 125,  -8,   3, -1},
241   { 0,  1,  -3,   8, 126,  -6,   2,  0}, { 0,  1,  -2,   4, 127,  -3,   1,  0},
242   // [1, 2)
243   {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
244   {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
245   {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
246   {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
247   {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
248   {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
249   {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
250   {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
251   {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
252   {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
253   {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
254   {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
255   {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
256   {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
257   {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
258   {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
259   // dummy (replicate row index 95)
260   {0, 0, 0,  -1,   4, 127,  -3, 1},
261 
262 #endif  // WARPEDPIXEL_PREC_BITS == 6
263 };
264 
265 /* clang-format on */
266 
267 #define DIV_LUT_PREC_BITS 14
268 #define DIV_LUT_BITS 8
269 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
270 
271 static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
272   16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
273   15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
274   15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
275   14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
276   13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
277   13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
278   13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
279   12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
280   12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
281   11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
282   11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
283   11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
284   10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
285   10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
286   10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
287   9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
288   9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
289   9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
290   9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
291   9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
292   8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
293   8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
294   8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
295   8240,  8224,  8208,  8192,
296 };
297 
298 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
299 // at precision of DIV_LUT_PREC_BITS along with the shift.
resolve_divisor_64(uint64_t D,int16_t * shift)300 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
301   int64_t f;
302   *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
303                                : get_msb((unsigned int)D));
304   // e is obtained from D after resetting the most significant 1 bit.
305   const int64_t e = D - ((uint64_t)1 << *shift);
306   // Get the most significant DIV_LUT_BITS (8) bits of e into f
307   if (*shift > DIV_LUT_BITS)
308     f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
309   else
310     f = e << (DIV_LUT_BITS - *shift);
311   assert(f <= DIV_LUT_NUM);
312   *shift += DIV_LUT_PREC_BITS;
313   // Use f as lookup into the precomputed table of multipliers
314   return div_lut[f];
315 }
316 
resolve_divisor_32(uint32_t D,int16_t * shift)317 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
318   int32_t f;
319   *shift = get_msb(D);
320   // e is obtained from D after resetting the most significant 1 bit.
321   const int32_t e = D - ((uint32_t)1 << *shift);
322   // Get the most significant DIV_LUT_BITS (8) bits of e into f
323   if (*shift > DIV_LUT_BITS)
324     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
325   else
326     f = e << (DIV_LUT_BITS - *shift);
327   assert(f <= DIV_LUT_NUM);
328   *shift += DIV_LUT_PREC_BITS;
329   // Use f as lookup into the precomputed table of multipliers
330   return div_lut[f];
331 }
332 
is_affine_valid(const WarpedMotionParams * const wm)333 static int is_affine_valid(const WarpedMotionParams *const wm) {
334   const int32_t *mat = wm->wmmat;
335   return (mat[2] > 0);
336 }
337 
is_affine_shear_allowed(int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)338 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
339                                    int16_t delta) {
340   if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
341       (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
342     return 0;
343   else
344     return 1;
345 }
346 
347 // Returns 1 on success or 0 on an invalid affine set
get_shear_params(WarpedMotionParams * wm)348 int get_shear_params(WarpedMotionParams *wm) {
349   const int32_t *mat = wm->wmmat;
350   if (!is_affine_valid(wm)) return 0;
351   wm->alpha =
352       clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
353   wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
354   int16_t shift;
355   int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
356   int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
357   wm->gamma =
358       clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
359   v = ((int64_t)mat[3] * mat[4]) * y;
360   wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
361                         (1 << WARPEDMODEL_PREC_BITS),
362                     INT16_MIN, INT16_MAX);
363 
364   wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
365               (1 << WARP_PARAM_REDUCE_BITS);
366   wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
367              (1 << WARP_PARAM_REDUCE_BITS);
368   wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
369               (1 << WARP_PARAM_REDUCE_BITS);
370   wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
371               (1 << WARP_PARAM_REDUCE_BITS);
372 
373   if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
374     return 0;
375 
376   return 1;
377 }
378 
highbd_error_measure(int err,int bd)379 static INLINE int highbd_error_measure(int err, int bd) {
380   const int b = bd - 8;
381   const int bmask = (1 << b) - 1;
382   const int v = (1 << b);
383   err = abs(err);
384   const int e1 = err >> b;
385   const int e2 = err & bmask;
386   return error_measure_lut[255 + e1] * (v - e2) +
387          error_measure_lut[256 + e1] * e2;
388 }
389 
390 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
391     for hardware implementations, see the comments above av1_warp_affine_c
392 */
av1_highbd_warp_affine_c(const int32_t * mat,const uint16_t * ref,int width,int height,int stride,uint16_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)393 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
394                               int width, int height, int stride, uint16_t *pred,
395                               int p_col, int p_row, int p_width, int p_height,
396                               int p_stride, int subsampling_x,
397                               int subsampling_y, int bd,
398                               ConvolveParams *conv_params, int16_t alpha,
399                               int16_t beta, int16_t gamma, int16_t delta) {
400   int32_t tmp[15 * 8];
401   const int reduce_bits_horiz =
402       conv_params->round_0 +
403       AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
404   const int reduce_bits_vert = conv_params->is_compound
405                                    ? conv_params->round_1
406                                    : 2 * FILTER_BITS - reduce_bits_horiz;
407   const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
408   const int offset_bits_horiz = bd + FILTER_BITS - 1;
409   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
410   const int round_bits =
411       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
412   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
413   (void)max_bits_horiz;
414   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
415 
416   for (int i = p_row; i < p_row + p_height; i += 8) {
417     for (int j = p_col; j < p_col + p_width; j += 8) {
418       // Calculate the center of this 8x8 block,
419       // project to luma coordinates (if in a subsampled chroma plane),
420       // apply the affine transformation,
421       // then convert back to the original coordinates (if necessary)
422       const int32_t src_x = (j + 4) << subsampling_x;
423       const int32_t src_y = (i + 4) << subsampling_y;
424       const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
425       const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
426       const int32_t x4 = dst_x >> subsampling_x;
427       const int32_t y4 = dst_y >> subsampling_y;
428 
429       const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
430       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
431       const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
432       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
433 
434       sx4 += alpha * (-4) + beta * (-4);
435       sy4 += gamma * (-4) + delta * (-4);
436 
437       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
438       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
439 
440       // Horizontal filter
441       for (int k = -7; k < 8; ++k) {
442         const int iy = clamp(iy4 + k, 0, height - 1);
443 
444         int sx = sx4 + beta * (k + 4);
445         for (int l = -4; l < 4; ++l) {
446           int ix = ix4 + l - 3;
447           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
448                            WARPEDPIXEL_PREC_SHIFTS;
449           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
450           const int16_t *coeffs = warped_filter[offs];
451 
452           int32_t sum = 1 << offset_bits_horiz;
453           for (int m = 0; m < 8; ++m) {
454             const int sample_x = clamp(ix + m, 0, width - 1);
455             sum += ref[iy * stride + sample_x] * coeffs[m];
456           }
457           sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
458           assert(0 <= sum && sum < (1 << max_bits_horiz));
459           tmp[(k + 7) * 8 + (l + 4)] = sum;
460           sx += alpha;
461         }
462       }
463 
464       // Vertical filter
465       for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
466         int sy = sy4 + delta * (k + 4);
467         for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
468           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
469                            WARPEDPIXEL_PREC_SHIFTS;
470           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
471           const int16_t *coeffs = warped_filter[offs];
472 
473           int32_t sum = 1 << offset_bits_vert;
474           for (int m = 0; m < 8; ++m) {
475             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
476           }
477 
478           if (conv_params->is_compound) {
479             CONV_BUF_TYPE *p =
480                 &conv_params
481                      ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
482                            (j - p_col + l + 4)];
483             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
484             if (conv_params->do_average) {
485               uint16_t *dst16 =
486                   &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
487               int32_t tmp32 = *p;
488               if (conv_params->use_dist_wtd_comp_avg) {
489                 tmp32 = tmp32 * conv_params->fwd_offset +
490                         sum * conv_params->bck_offset;
491                 tmp32 = tmp32 >> DIST_PRECISION_BITS;
492               } else {
493                 tmp32 += sum;
494                 tmp32 = tmp32 >> 1;
495               }
496               tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
497                       (1 << (offset_bits - conv_params->round_1 - 1));
498               *dst16 =
499                   clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
500             } else {
501               *p = sum;
502             }
503           } else {
504             uint16_t *p =
505                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
506             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
507             assert(0 <= sum && sum < (1 << (bd + 2)));
508             *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
509           }
510           sy += gamma;
511         }
512       }
513     }
514   }
515 }
516 
highbd_warp_plane(WarpedMotionParams * wm,const uint8_t * const ref8,int width,int height,int stride,const uint8_t * const pred8,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params)517 static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
518                               int width, int height, int stride,
519                               const uint8_t *const pred8, int p_col, int p_row,
520                               int p_width, int p_height, int p_stride,
521                               int subsampling_x, int subsampling_y, int bd,
522                               ConvolveParams *conv_params) {
523   assert(wm->wmtype <= AFFINE);
524   if (wm->wmtype == ROTZOOM) {
525     wm->wmmat[5] = wm->wmmat[2];
526     wm->wmmat[4] = -wm->wmmat[3];
527   }
528   const int32_t *const mat = wm->wmmat;
529   const int16_t alpha = wm->alpha;
530   const int16_t beta = wm->beta;
531   const int16_t gamma = wm->gamma;
532   const int16_t delta = wm->delta;
533 
534   const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
535   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
536   av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
537                          p_width, p_height, p_stride, subsampling_x,
538                          subsampling_y, bd, conv_params, alpha, beta, gamma,
539                          delta);
540 }
541 
highbd_frame_error(const uint16_t * const ref,int stride,const uint16_t * const dst,int p_width,int p_height,int p_stride,int bd)542 static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
543                                   const uint16_t *const dst, int p_width,
544                                   int p_height, int p_stride, int bd) {
545   int64_t sum_error = 0;
546   for (int i = 0; i < p_height; ++i) {
547     for (int j = 0; j < p_width; ++j) {
548       sum_error +=
549           highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
550     }
551   }
552   return sum_error;
553 }
554 
highbd_warp_error(WarpedMotionParams * wm,const uint8_t * const ref8,int width,int height,int stride,const uint8_t * const dst8,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,int64_t best_error)555 static int64_t highbd_warp_error(
556     WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
557     int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
558     int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
559     int64_t best_error) {
560   int64_t gm_sumerr = 0;
561   const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
562   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
563   uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
564 
565   ConvolveParams conv_params = get_conv_params(0, 0, bd);
566   conv_params.use_dist_wtd_comp_avg = 0;
567   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
568     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
569       // avoid warping extra 8x8 blocks in the padded region of the frame
570       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
571       const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
572       const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
573       highbd_warp_plane(wm, ref8, width, height, stride,
574                         CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
575                         WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
576                         &conv_params);
577 
578       gm_sumerr += highbd_frame_error(
579           tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
580           warp_w, warp_h, p_stride, bd);
581       if (gm_sumerr > best_error) return gm_sumerr;
582     }
583   }
584   return gm_sumerr;
585 }
586 
error_measure(int err)587 static INLINE int error_measure(int err) {
588   return error_measure_lut[255 + err];
589 }
590 
591 /* The warp filter for ROTZOOM and AFFINE models works as follows:
592    * Split the input into 8x8 blocks
593    * For each block, project the point (4, 4) within the block, to get the
594      overall block position. Split into integer and fractional coordinates,
595      maintaining full WARPEDMODEL precision
596    * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
597      variable horizontal offset. This means that, while the rows of the
598      intermediate buffer align with the rows of the *reference* image, the
599      columns align with the columns of the *destination* image.
600    * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
601      destination is too small we crop the output at this stage). Each pixel has
602      a variable vertical offset, so that the resulting rows are aligned with
603      the rows of the destination image.
604 
605    To accomplish these alignments, we factor the warp matrix as a
606    product of two shear / asymmetric zoom matrices:
607    / a b \  = /   1       0    \ * / 1+alpha  beta \
608    \ c d /    \ gamma  1+delta /   \    0      1   /
609    where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
610    The horizontal shear (with alpha and beta) is applied first,
611    then the vertical shear (with gamma and delta) is applied second.
612 
613    The only limitation is that, to fit this in a fixed 8-tap filter size,
614    the fractional pixel offsets must be at most +-1. Since the horizontal filter
615    generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
616    within the block, the parameters must satisfy
617    4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
618    for this filter to be applicable.
619 
620    Note: This function assumes that the caller has done all of the relevant
621    checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
622    are set appropriately (if using a ROTZOOM model), and that alpha, beta,
623    gamma, delta are all in range.
624 
625    TODO(david.barker): Maybe support scaled references?
626 */
627 /* A note on hardware implementation:
628     The warp filter is intended to be implementable using the same hardware as
629     the high-precision convolve filters from the loop-restoration and
630     convolve-round experiments.
631 
632     For a single filter stage, considering all of the coefficient sets for the
633     warp filter and the regular convolution filter, an input in the range
634     [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
635     before rounding.
636 
637     Allowing for some changes to the filter coefficient sets, call the range
638     [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
639     we can replace this by the range [0, 256 * 2^k], which can be stored in an
640     unsigned value with 8 + k bits.
641 
642     This allows the derivation of the appropriate bit widths and offsets for
643     the various intermediate values: If
644 
645     F := FILTER_BITS = 7 (or else the above ranges need adjusting)
646          So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
647          intermediate value.
648     H := ROUND0_BITS
649     V := VERSHEAR_REDUCE_PREC_BITS
650     (and note that we must have H + V = 2*F for the output to have the same
651      scale as the input)
652 
653     then we end up with the following offsets and ranges:
654     Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
655                        uint{bd + F + 1}
656     After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
657     Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
658                      uint{bd + 2*F + 2 - H}
659     After rounding: The final value, before undoing the offset, fits into a
660                     uint{bd + 2}.
661 
662     Then we need to undo the offsets before clamping to a pixel. Note that,
663     if we do this at the end, the amount to subtract is actually independent
664     of H and V:
665 
666     offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
667                          (1 << ((bd + 2*F - H) - V))
668                       == (1 << (bd - 1)) + (1 << bd)
669 
670     This allows us to entirely avoid clamping in both the warp filter and
671     the convolve-round experiment. As of the time of writing, the Wiener filter
672     from loop-restoration can encode a central coefficient up to 216, which
673     leads to a maximum value of about 282 * 2^k after applying the offset.
674     So in that case we still need to clamp.
675 */
av1_warp_affine_c(const int32_t * mat,const uint8_t * ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)676 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
677                        int height, int stride, uint8_t *pred, int p_col,
678                        int p_row, int p_width, int p_height, int p_stride,
679                        int subsampling_x, int subsampling_y,
680                        ConvolveParams *conv_params, int16_t alpha, int16_t beta,
681                        int16_t gamma, int16_t delta) {
682   int32_t tmp[15 * 8];
683   const int bd = 8;
684   const int reduce_bits_horiz = conv_params->round_0;
685   const int reduce_bits_vert = conv_params->is_compound
686                                    ? conv_params->round_1
687                                    : 2 * FILTER_BITS - reduce_bits_horiz;
688   const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
689   const int offset_bits_horiz = bd + FILTER_BITS - 1;
690   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
691   const int round_bits =
692       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
693   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
694   (void)max_bits_horiz;
695   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
696   assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
697 
698   for (int i = p_row; i < p_row + p_height; i += 8) {
699     for (int j = p_col; j < p_col + p_width; j += 8) {
700       // Calculate the center of this 8x8 block,
701       // project to luma coordinates (if in a subsampled chroma plane),
702       // apply the affine transformation,
703       // then convert back to the original coordinates (if necessary)
704       const int32_t src_x = (j + 4) << subsampling_x;
705       const int32_t src_y = (i + 4) << subsampling_y;
706       const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
707       const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
708       const int32_t x4 = dst_x >> subsampling_x;
709       const int32_t y4 = dst_y >> subsampling_y;
710 
711       int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
712       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
713       int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
714       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
715 
716       sx4 += alpha * (-4) + beta * (-4);
717       sy4 += gamma * (-4) + delta * (-4);
718 
719       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
720       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
721 
722       // Horizontal filter
723       for (int k = -7; k < 8; ++k) {
724         // Clamp to top/bottom edge of the frame
725         const int iy = clamp(iy4 + k, 0, height - 1);
726 
727         int sx = sx4 + beta * (k + 4);
728 
729         for (int l = -4; l < 4; ++l) {
730           int ix = ix4 + l - 3;
731           // At this point, sx = sx4 + alpha * l + beta * k
732           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
733                            WARPEDPIXEL_PREC_SHIFTS;
734           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
735           const int16_t *coeffs = warped_filter[offs];
736 
737           int32_t sum = 1 << offset_bits_horiz;
738           for (int m = 0; m < 8; ++m) {
739             // Clamp to left/right edge of the frame
740             const int sample_x = clamp(ix + m, 0, width - 1);
741 
742             sum += ref[iy * stride + sample_x] * coeffs[m];
743           }
744           sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
745           assert(0 <= sum && sum < (1 << max_bits_horiz));
746           tmp[(k + 7) * 8 + (l + 4)] = sum;
747           sx += alpha;
748         }
749       }
750 
751       // Vertical filter
752       for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
753         int sy = sy4 + delta * (k + 4);
754         for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
755           // At this point, sy = sy4 + gamma * l + delta * k
756           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
757                            WARPEDPIXEL_PREC_SHIFTS;
758           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
759           const int16_t *coeffs = warped_filter[offs];
760 
761           int32_t sum = 1 << offset_bits_vert;
762           for (int m = 0; m < 8; ++m) {
763             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
764           }
765 
766           if (conv_params->is_compound) {
767             CONV_BUF_TYPE *p =
768                 &conv_params
769                      ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
770                            (j - p_col + l + 4)];
771             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
772             if (conv_params->do_average) {
773               uint8_t *dst8 =
774                   &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
775               int32_t tmp32 = *p;
776               if (conv_params->use_dist_wtd_comp_avg) {
777                 tmp32 = tmp32 * conv_params->fwd_offset +
778                         sum * conv_params->bck_offset;
779                 tmp32 = tmp32 >> DIST_PRECISION_BITS;
780               } else {
781                 tmp32 += sum;
782                 tmp32 = tmp32 >> 1;
783               }
784               tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
785                       (1 << (offset_bits - conv_params->round_1 - 1));
786               *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
787             } else {
788               *p = sum;
789             }
790           } else {
791             uint8_t *p =
792                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
793             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
794             assert(0 <= sum && sum < (1 << (bd + 2)));
795             *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
796           }
797           sy += gamma;
798         }
799       }
800     }
801   }
802 }
803 
warp_plane(WarpedMotionParams * wm,const uint8_t * const ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params)804 static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
805                        int width, int height, int stride, uint8_t *pred,
806                        int p_col, int p_row, int p_width, int p_height,
807                        int p_stride, int subsampling_x, int subsampling_y,
808                        ConvolveParams *conv_params) {
809   assert(wm->wmtype <= AFFINE);
810   if (wm->wmtype == ROTZOOM) {
811     wm->wmmat[5] = wm->wmmat[2];
812     wm->wmmat[4] = -wm->wmmat[3];
813   }
814   const int32_t *const mat = wm->wmmat;
815   const int16_t alpha = wm->alpha;
816   const int16_t beta = wm->beta;
817   const int16_t gamma = wm->gamma;
818   const int16_t delta = wm->delta;
819   av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
820                   p_height, p_stride, subsampling_x, subsampling_y, conv_params,
821                   alpha, beta, gamma, delta);
822 }
823 
frame_error(const uint8_t * const ref,int stride,const uint8_t * const dst,int p_width,int p_height,int p_stride)824 static int64_t frame_error(const uint8_t *const ref, int stride,
825                            const uint8_t *const dst, int p_width, int p_height,
826                            int p_stride) {
827   int64_t sum_error = 0;
828   for (int i = 0; i < p_height; ++i) {
829     for (int j = 0; j < p_width; ++j) {
830       sum_error +=
831           (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
832     }
833   }
834   return sum_error;
835 }
836 
warp_error(WarpedMotionParams * wm,const uint8_t * const ref,int width,int height,int stride,const uint8_t * const dst,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int64_t best_error)837 static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
838                           int width, int height, int stride,
839                           const uint8_t *const dst, int p_col, int p_row,
840                           int p_width, int p_height, int p_stride,
841                           int subsampling_x, int subsampling_y,
842                           int64_t best_error) {
843   int64_t gm_sumerr = 0;
844   int warp_w, warp_h;
845   int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
846   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
847   uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
848   ConvolveParams conv_params = get_conv_params(0, 0, 8);
849   conv_params.use_dist_wtd_comp_avg = 0;
850 
851   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
852     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
853       // avoid warping extra 8x8 blocks in the padded region of the frame
854       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
855       warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
856       warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
857       warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
858                  WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
859 
860       gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
861                                warp_w, warp_h, p_stride);
862       if (gm_sumerr > best_error) return gm_sumerr;
863     }
864   }
865   return gm_sumerr;
866 }
867 
av1_frame_error(int use_hbd,int bd,const uint8_t * ref,int stride,uint8_t * dst,int p_width,int p_height,int p_stride)868 int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
869                         uint8_t *dst, int p_width, int p_height, int p_stride) {
870   if (use_hbd) {
871     return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
872                               CONVERT_TO_SHORTPTR(dst), p_width, p_height,
873                               p_stride, bd);
874   }
875   return frame_error(ref, stride, dst, p_width, p_height, p_stride);
876 }
877 
av1_warp_error(WarpedMotionParams * wm,int use_hbd,int bd,const uint8_t * ref,int width,int height,int stride,uint8_t * dst,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int64_t best_error)878 int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
879                        const uint8_t *ref, int width, int height, int stride,
880                        uint8_t *dst, int p_col, int p_row, int p_width,
881                        int p_height, int p_stride, int subsampling_x,
882                        int subsampling_y, int64_t best_error) {
883   if (wm->wmtype <= AFFINE)
884     if (!get_shear_params(wm)) return 1;
885   if (use_hbd)
886     return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
887                              p_width, p_height, p_stride, subsampling_x,
888                              subsampling_y, bd, best_error);
889   return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
890                     p_height, p_stride, subsampling_x, subsampling_y,
891                     best_error);
892 }
893 
av1_warp_plane(WarpedMotionParams * wm,int use_hbd,int bd,const uint8_t * ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params)894 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
895                     const uint8_t *ref, int width, int height, int stride,
896                     uint8_t *pred, int p_col, int p_row, int p_width,
897                     int p_height, int p_stride, int subsampling_x,
898                     int subsampling_y, ConvolveParams *conv_params) {
899   if (use_hbd)
900     highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
901                       p_width, p_height, p_stride, subsampling_x, subsampling_y,
902                       bd, conv_params);
903   else
904     warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
905                p_height, p_stride, subsampling_x, subsampling_y, conv_params);
906 }
907 
908 #define LS_MV_MAX 256  // max mv in 1/8-pel
909 // Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
910 #define LS_STEP 8
911 
912 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
913 // the precision needed is:
914 //   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
915 //   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
916 //   1 [for sign] +
917 //   LEAST_SQUARES_SAMPLES_MAX_BITS
918 //        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
919 // The value is 23
920 #define LS_MAT_RANGE_BITS \
921   ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
922 
923 // Bit-depth reduction from the full-range
924 #define LS_MAT_DOWN_BITS 2
925 
926 // bits range of A, Bx and By after downshifting
927 #define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
928 #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
929 #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
930 
931 // By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are
932 // 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here.
933 #define LS_SQUARE(a)                                          \
934   (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
935    (2 + LS_MAT_DOWN_BITS))
936 #define LS_PRODUCT1(a, b)                                           \
937   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
938    (2 + LS_MAT_DOWN_BITS))
939 #define LS_PRODUCT2(a, b)                                               \
940   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
941    (2 + LS_MAT_DOWN_BITS))
942 
943 #define USE_LIMITED_PREC_MULT 0
944 
945 #if USE_LIMITED_PREC_MULT
946 
947 #define MUL_PREC_BITS 16
resolve_multiplier_64(uint64_t D,int16_t * shift)948 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
949   int msb = 0;
950   uint16_t mult = 0;
951   *shift = 0;
952   if (D != 0) {
953     msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
954                               : get_msb((unsigned int)D));
955     if (msb >= MUL_PREC_BITS) {
956       mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
957       *shift = msb + 1 - MUL_PREC_BITS;
958     } else {
959       mult = (uint16_t)D;
960       *shift = 0;
961     }
962   }
963   return mult;
964 }
965 
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)966 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
967   int32_t ret;
968   int16_t mshift;
969   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
970   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
971   shift -= mshift;
972   if (shift > 0) {
973     return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
974                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
975                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
976   } else {
977     return (int32_t)clamp(v * (1 << (-shift)),
978                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
979                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
980   }
981   return ret;
982 }
983 
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)984 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
985   int16_t mshift;
986   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
987   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
988   shift -= mshift;
989   if (shift > 0) {
990     return (int32_t)clamp(
991         ROUND_POWER_OF_TWO_SIGNED(v, shift),
992         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
993         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
994   } else {
995     return (int32_t)clamp(
996         v * (1 << (-shift)),
997         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
998         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
999   }
1000 }
1001 
1002 #else
1003 
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)1004 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
1005   int64_t v = Px * (int64_t)iDet;
1006   return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1007                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1008                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1009 }
1010 
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)1011 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
1012   int64_t v = Px * (int64_t)iDet;
1013   return (int32_t)clamp64(
1014       ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1015       (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1016       (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1017 }
1018 #endif  // USE_LIMITED_PREC_MULT
1019 
find_affine_int(int np,const int * pts1,const int * pts2,BLOCK_SIZE bsize,int mvy,int mvx,WarpedMotionParams * wm,int mi_row,int mi_col)1020 static int find_affine_int(int np, const int *pts1, const int *pts2,
1021                            BLOCK_SIZE bsize, int mvy, int mvx,
1022                            WarpedMotionParams *wm, int mi_row, int mi_col) {
1023   int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
1024   int32_t Bx[2] = { 0, 0 };
1025   int32_t By[2] = { 0, 0 };
1026   int i;
1027 
1028   const int bw = block_size_wide[bsize];
1029   const int bh = block_size_high[bsize];
1030   const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
1031   const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
1032   const int suy = rsuy * 8;
1033   const int sux = rsux * 8;
1034   const int duy = suy + mvy;
1035   const int dux = sux + mvx;
1036   const int isuy = (mi_row * MI_SIZE + rsuy);
1037   const int isux = (mi_col * MI_SIZE + rsux);
1038 
1039   // Assume the center pixel of the block has exactly the same motion vector
1040   // as transmitted for the block. First shift the origin of the source
1041   // points to the block center, and the origin of the destination points to
1042   // the block center added to the motion vector transmitted.
1043   // Let (xi, yi) denote the source points and (xi', yi') denote destination
1044   // points after origin shfifting, for i = 0, 1, 2, .... n-1.
1045   // Then if  P = [x0, y0,
1046   //               x1, y1
1047   //               x2, y1,
1048   //                ....
1049   //              ]
1050   //          q = [x0', x1', x2', ... ]'
1051   //          r = [y0', y1', y2', ... ]'
1052   // the least squares problems that need to be solved are:
1053   //          [h1, h2]' = inv(P'P)P'q and
1054   //          [h3, h4]' = inv(P'P)P'r
1055   // where the affine transformation is given by:
1056   //          x' = h1.x + h2.y
1057   //          y' = h3.x + h4.y
1058   //
1059   // The loop below computes: A = P'P, Bx = P'q, By = P'r
1060   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
1061   // Contribution from neighbor block
1062   for (i = 0; i < np; i++) {
1063     const int dx = pts2[i * 2] - dux;
1064     const int dy = pts2[i * 2 + 1] - duy;
1065     const int sx = pts1[i * 2] - sux;
1066     const int sy = pts1[i * 2 + 1] - suy;
1067     // (TODO)yunqing: This comparison wouldn't be necessary if the sample
1068     // selection is done in find_samples(). Also, global offset can be removed
1069     // while collecting samples.
1070     if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
1071       A[0][0] += LS_SQUARE(sx);
1072       A[0][1] += LS_PRODUCT1(sx, sy);
1073       A[1][1] += LS_SQUARE(sy);
1074       Bx[0] += LS_PRODUCT2(sx, dx);
1075       Bx[1] += LS_PRODUCT1(sy, dx);
1076       By[0] += LS_PRODUCT1(sx, dy);
1077       By[1] += LS_PRODUCT2(sy, dy);
1078     }
1079   }
1080 
1081   // Just for debugging, and can be removed later.
1082   assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
1083   assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
1084   assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
1085   assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
1086   assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
1087   assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
1088   assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
1089 
1090   int64_t Det;
1091   int16_t iDet, shift;
1092 
1093   // Compute Determinant of A
1094   Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
1095   if (Det == 0) return 1;
1096   iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
1097   shift -= WARPEDMODEL_PREC_BITS;
1098   if (shift < 0) {
1099     iDet <<= (-shift);
1100     shift = 0;
1101   }
1102 
1103   int64_t Px[2], Py[2];
1104 
1105   // These divided by the Det, are the least squares solutions
1106   Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
1107   Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
1108   Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
1109   Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
1110 
1111   wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
1112   wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
1113   wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
1114   wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
1115 
1116   // Note: In the vx, vy expressions below, the max value of each of the
1117   // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
1118   // for the first term so that the overall sum in the worst case fits
1119   // within 32 bits overall.
1120   int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
1121                (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
1122                 isuy * wm->wmmat[3]);
1123   int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
1124                (isux * wm->wmmat[4] +
1125                 isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
1126   wm->wmmat[0] =
1127       clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
1128   wm->wmmat[1] =
1129       clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
1130 
1131   wm->wmmat[6] = wm->wmmat[7] = 0;
1132   return 0;
1133 }
1134 
find_projection(int np,int * pts1,int * pts2,BLOCK_SIZE bsize,int mvy,int mvx,WarpedMotionParams * wm_params,int mi_row,int mi_col)1135 int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
1136                     int mvx, WarpedMotionParams *wm_params, int mi_row,
1137                     int mi_col) {
1138   assert(wm_params->wmtype == AFFINE);
1139 
1140   if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
1141                       mi_col))
1142     return 1;
1143 
1144   // check compatibility with the fast warp filter
1145   if (!get_shear_params(wm_params)) return 1;
1146 
1147   return 0;
1148 }
1149