1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <memory.h>
15 #include <math.h>
16 #include <assert.h>
17 
18 #include "config/av1_rtcd.h"
19 
20 #include "av1/common/warped_motion.h"
21 #include "av1/common/scale.h"
22 
23 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
24 // at a time. The zoom/rotation/shear in the model are applied to the
25 // "fractional" position of each pixel, which therefore varies within
26 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
27 // We need an extra 2 taps to fit this in, for a total of 8 taps.
28 /* clang-format off */
29 const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
30 #if WARPEDPIXEL_PREC_BITS == 6
31   // [-1, 0)
32   { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
33   { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
34   { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
35   { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
36   { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
37   { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
38   { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
39   { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
40   { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
41   { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
42   { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
43   { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
44   { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
45   { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
46   { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
47   { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
48   { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
49   { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
50   { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
51   { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
52   { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
53   { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
54   { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
55   { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
56   { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
57   { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
58   { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
59   { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
60   { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
61   { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
62   { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
63   { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
64 
65   // [0, 1)
66   { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
67   { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
68   { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
69   {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
70   {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
71   {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
72   {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
73   {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
74   {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
75   {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
76   {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
77   {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
78   {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
79   {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
80   {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
81   {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
82   {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
83   {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
84   {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
85   {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
86   {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
87   {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
88   {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
89   {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
90   {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
91   {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
92   {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
93   {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
94   {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
95   {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
96   { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
97   { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
98 
99   // [1, 2)
100   { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
101   { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
102   { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
103   { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
104   { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
105   { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
106   { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
107   { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
108   { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
109   { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
110   { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
111   { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
112   { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
113   { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
114   { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
115   { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
116   { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
117   { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
118   { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
119   { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
120   { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
121   { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
122   { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
123   { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
124   { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
125   { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
126   { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
127   { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
128   { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
129   { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
130   { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
131   { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
132   // dummy (replicate row index 191)
133   { 0, 0, 0,   0,   2, 127, - 1, 0 },
134 
135 #elif WARPEDPIXEL_PREC_BITS == 5
136   // [-1, 0)
137   {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
138   {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
139   {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
140   {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
141   {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
142   {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
143   {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
144   {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
145   {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
146   {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
147   {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
148   {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
149   {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
150   {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
151   {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
152   {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
153   // [0, 1)
154   { 0,  0,   0, 127,   1,   0,   0,  0}, { 0,  1,  -3, 127,   4,  -2,   1,  0},
155   { 0,  2,  -6, 126,   8,  -3,   1,  0}, {-1,  3,  -8, 125,  13,  -5,   2, -1},
156   {-1,  4, -11, 123,  18,  -7,   3, -1}, {-1,  4, -13, 121,  23,  -8,   3, -1},
157   {-1,  5, -15, 119,  27, -10,   4, -1}, {-2,  6, -17, 116,  33, -12,   5, -1},
158   {-2,  6, -18, 113,  38, -13,   5, -1}, {-2,  7, -19, 110,  43, -15,   6, -2},
159   {-2,  7, -20, 106,  49, -16,   6, -2}, {-2,  7, -21, 102,  54, -17,   7, -2},
160   {-2,  8, -22,  98,  59, -18,   7, -2}, {-2,  8, -22,  94,  64, -19,   7, -2},
161   {-2,  8, -22,  89,  69, -20,   8, -2}, {-2,  8, -21,  84,  74, -21,   8, -2},
162   {-2,  8, -21,  79,  79, -21,   8, -2}, {-2,  8, -21,  74,  84, -21,   8, -2},
163   {-2,  8, -20,  69,  89, -22,   8, -2}, {-2,  7, -19,  64,  94, -22,   8, -2},
164   {-2,  7, -18,  59,  98, -22,   8, -2}, {-2,  7, -17,  54, 102, -21,   7, -2},
165   {-2,  6, -16,  49, 106, -20,   7, -2}, {-2,  6, -15,  43, 110, -19,   7, -2},
166   {-1,  5, -13,  38, 113, -18,   6, -2}, {-1,  5, -12,  33, 116, -17,   6, -2},
167   {-1,  4, -10,  27, 119, -15,   5, -1}, {-1,  3,  -8,  23, 121, -13,   4, -1},
168   {-1,  3,  -7,  18, 123, -11,   4, -1}, {-1,  2,  -5,  13, 125,  -8,   3, -1},
169   { 0,  1,  -3,   8, 126,  -6,   2,  0}, { 0,  1,  -2,   4, 127,  -3,   1,  0},
170   // [1, 2)
171   {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
172   {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
173   {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
174   {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
175   {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
176   {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
177   {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
178   {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
179   {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
180   {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
181   {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
182   {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
183   {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
184   {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
185   {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
186   {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
187   // dummy (replicate row index 95)
188   {0, 0, 0,  -1,   4, 127,  -3, 1},
189 
190 #endif  // WARPEDPIXEL_PREC_BITS == 6
191 };
192 
193 /* clang-format on */
194 
195 #define DIV_LUT_PREC_BITS 14
196 #define DIV_LUT_BITS 8
197 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
198 
199 static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
200   16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
201   15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
202   15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
203   14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
204   13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
205   13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
206   13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
207   12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
208   12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
209   11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
210   11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
211   11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
212   10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
213   10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
214   10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
215   9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
216   9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
217   9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
218   9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
219   9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
220   8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
221   8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
222   8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
223   8240,  8224,  8208,  8192,
224 };
225 
226 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
227 // at precision of DIV_LUT_PREC_BITS along with the shift.
resolve_divisor_64(uint64_t D,int16_t * shift)228 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
229   int64_t f;
230   *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
231                                : get_msb((unsigned int)D));
232   // e is obtained from D after resetting the most significant 1 bit.
233   const int64_t e = D - ((uint64_t)1 << *shift);
234   // Get the most significant DIV_LUT_BITS (8) bits of e into f
235   if (*shift > DIV_LUT_BITS)
236     f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
237   else
238     f = e << (DIV_LUT_BITS - *shift);
239   assert(f <= DIV_LUT_NUM);
240   *shift += DIV_LUT_PREC_BITS;
241   // Use f as lookup into the precomputed table of multipliers
242   return div_lut[f];
243 }
244 
resolve_divisor_32(uint32_t D,int16_t * shift)245 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
246   int32_t f;
247   *shift = get_msb(D);
248   // e is obtained from D after resetting the most significant 1 bit.
249   const int32_t e = D - ((uint32_t)1 << *shift);
250   // Get the most significant DIV_LUT_BITS (8) bits of e into f
251   if (*shift > DIV_LUT_BITS)
252     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
253   else
254     f = e << (DIV_LUT_BITS - *shift);
255   assert(f <= DIV_LUT_NUM);
256   *shift += DIV_LUT_PREC_BITS;
257   // Use f as lookup into the precomputed table of multipliers
258   return div_lut[f];
259 }
260 
is_affine_valid(const WarpedMotionParams * const wm)261 static int is_affine_valid(const WarpedMotionParams *const wm) {
262   const int32_t *mat = wm->wmmat;
263   return (mat[2] > 0);
264 }
265 
is_affine_shear_allowed(int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)266 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
267                                    int16_t delta) {
268   if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
269       (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
270     return 0;
271   else
272     return 1;
273 }
274 
275 // Returns 1 on success or 0 on an invalid affine set
av1_get_shear_params(WarpedMotionParams * wm)276 int av1_get_shear_params(WarpedMotionParams *wm) {
277   const int32_t *mat = wm->wmmat;
278   if (!is_affine_valid(wm)) return 0;
279   wm->alpha =
280       clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
281   wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
282   int16_t shift;
283   int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
284   int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
285   wm->gamma =
286       clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
287   v = ((int64_t)mat[3] * mat[4]) * y;
288   wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
289                         (1 << WARPEDMODEL_PREC_BITS),
290                     INT16_MIN, INT16_MAX);
291 
292   wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
293               (1 << WARP_PARAM_REDUCE_BITS);
294   wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
295              (1 << WARP_PARAM_REDUCE_BITS);
296   wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
297               (1 << WARP_PARAM_REDUCE_BITS);
298   wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
299               (1 << WARP_PARAM_REDUCE_BITS);
300 
301   if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
302     return 0;
303 
304   return 1;
305 }
306 
307 #if CONFIG_AV1_HIGHBITDEPTH
highbd_error_measure(int err,int bd)308 static INLINE int highbd_error_measure(int err, int bd) {
309   const int b = bd - 8;
310   const int bmask = (1 << b) - 1;
311   const int v = (1 << b);
312   err = abs(err);
313   const int e1 = err >> b;
314   const int e2 = err & bmask;
315   return error_measure_lut[255 + e1] * (v - e2) +
316          error_measure_lut[256 + e1] * e2;
317 }
318 
319 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
320     for hardware implementations, see the comments above av1_warp_affine_c
321 */
av1_highbd_warp_affine_c(const int32_t * mat,const uint16_t * ref,int width,int height,int stride,uint16_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)322 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
323                               int width, int height, int stride, uint16_t *pred,
324                               int p_col, int p_row, int p_width, int p_height,
325                               int p_stride, int subsampling_x,
326                               int subsampling_y, int bd,
327                               ConvolveParams *conv_params, int16_t alpha,
328                               int16_t beta, int16_t gamma, int16_t delta) {
329   int32_t tmp[15 * 8];
330   const int reduce_bits_horiz =
331       conv_params->round_0 +
332       AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
333   const int reduce_bits_vert = conv_params->is_compound
334                                    ? conv_params->round_1
335                                    : 2 * FILTER_BITS - reduce_bits_horiz;
336   const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
337   const int offset_bits_horiz = bd + FILTER_BITS - 1;
338   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
339   const int round_bits =
340       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
341   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
342   (void)max_bits_horiz;
343   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
344 
345   for (int i = p_row; i < p_row + p_height; i += 8) {
346     for (int j = p_col; j < p_col + p_width; j += 8) {
347       // Calculate the center of this 8x8 block,
348       // project to luma coordinates (if in a subsampled chroma plane),
349       // apply the affine transformation,
350       // then convert back to the original coordinates (if necessary)
351       const int32_t src_x = (j + 4) << subsampling_x;
352       const int32_t src_y = (i + 4) << subsampling_y;
353       const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
354       const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
355       const int32_t x4 = dst_x >> subsampling_x;
356       const int32_t y4 = dst_y >> subsampling_y;
357 
358       const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
359       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
360       const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
361       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
362 
363       sx4 += alpha * (-4) + beta * (-4);
364       sy4 += gamma * (-4) + delta * (-4);
365 
366       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
367       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
368 
369       // Horizontal filter
370       for (int k = -7; k < 8; ++k) {
371         const int iy = clamp(iy4 + k, 0, height - 1);
372 
373         int sx = sx4 + beta * (k + 4);
374         for (int l = -4; l < 4; ++l) {
375           int ix = ix4 + l - 3;
376           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
377                            WARPEDPIXEL_PREC_SHIFTS;
378           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
379           const int16_t *coeffs = av1_warped_filter[offs];
380 
381           int32_t sum = 1 << offset_bits_horiz;
382           for (int m = 0; m < 8; ++m) {
383             const int sample_x = clamp(ix + m, 0, width - 1);
384             sum += ref[iy * stride + sample_x] * coeffs[m];
385           }
386           sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
387           assert(0 <= sum && sum < (1 << max_bits_horiz));
388           tmp[(k + 7) * 8 + (l + 4)] = sum;
389           sx += alpha;
390         }
391       }
392 
393       // Vertical filter
394       for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
395         int sy = sy4 + delta * (k + 4);
396         for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
397           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
398                            WARPEDPIXEL_PREC_SHIFTS;
399           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
400           const int16_t *coeffs = av1_warped_filter[offs];
401 
402           int32_t sum = 1 << offset_bits_vert;
403           for (int m = 0; m < 8; ++m) {
404             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
405           }
406 
407           if (conv_params->is_compound) {
408             CONV_BUF_TYPE *p =
409                 &conv_params
410                      ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
411                            (j - p_col + l + 4)];
412             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
413             if (conv_params->do_average) {
414               uint16_t *dst16 =
415                   &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
416               int32_t tmp32 = *p;
417               if (conv_params->use_dist_wtd_comp_avg) {
418                 tmp32 = tmp32 * conv_params->fwd_offset +
419                         sum * conv_params->bck_offset;
420                 tmp32 = tmp32 >> DIST_PRECISION_BITS;
421               } else {
422                 tmp32 += sum;
423                 tmp32 = tmp32 >> 1;
424               }
425               tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
426                       (1 << (offset_bits - conv_params->round_1 - 1));
427               *dst16 =
428                   clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
429             } else {
430               *p = sum;
431             }
432           } else {
433             uint16_t *p =
434                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
435             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
436             assert(0 <= sum && sum < (1 << (bd + 2)));
437             *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
438           }
439           sy += gamma;
440         }
441       }
442     }
443   }
444 }
445 
highbd_warp_plane(WarpedMotionParams * wm,const uint16_t * const ref,int width,int height,int stride,uint16_t * const pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params)446 void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
447                        int width, int height, int stride, uint16_t *const pred,
448                        int p_col, int p_row, int p_width, int p_height,
449                        int p_stride, int subsampling_x, int subsampling_y,
450                        int bd, ConvolveParams *conv_params) {
451   assert(wm->wmtype <= AFFINE);
452   if (wm->wmtype == ROTZOOM) {
453     wm->wmmat[5] = wm->wmmat[2];
454     wm->wmmat[4] = -wm->wmmat[3];
455   }
456   const int32_t *const mat = wm->wmmat;
457   const int16_t alpha = wm->alpha;
458   const int16_t beta = wm->beta;
459   const int16_t gamma = wm->gamma;
460   const int16_t delta = wm->delta;
461 
462   av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
463                          p_width, p_height, p_stride, subsampling_x,
464                          subsampling_y, bd, conv_params, alpha, beta, gamma,
465                          delta);
466 }
467 
av1_calc_highbd_frame_error(const uint16_t * const ref,int stride,const uint16_t * const dst,int p_width,int p_height,int p_stride,int bd)468 int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
469                                     const uint16_t *const dst, int p_width,
470                                     int p_height, int p_stride, int bd) {
471   int64_t sum_error = 0;
472   for (int i = 0; i < p_height; ++i) {
473     for (int j = 0; j < p_width; ++j) {
474       sum_error +=
475           highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
476     }
477   }
478   return sum_error;
479 }
480 
highbd_segmented_frame_error(const uint16_t * const ref,int stride,const uint16_t * const dst,int p_width,int p_height,int p_stride,int bd,uint8_t * segment_map,int segment_map_stride)481 static int64_t highbd_segmented_frame_error(
482     const uint16_t *const ref, int stride, const uint16_t *const dst,
483     int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
484     int segment_map_stride) {
485   int patch_w, patch_h;
486   const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
487   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
488   int64_t sum_error = 0;
489   for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
490     for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
491       int seg_x = j >> WARP_ERROR_BLOCK_LOG;
492       int seg_y = i >> WARP_ERROR_BLOCK_LOG;
493       // Only compute the error if this block contains inliers from the motion
494       // model
495       if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
496 
497       // avoid computing error into the frame padding
498       patch_w = AOMMIN(error_bsize_w, p_width - j);
499       patch_h = AOMMIN(error_bsize_h, p_height - i);
500       sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
501                                                dst + j + i * p_stride, patch_w,
502                                                patch_h, p_stride, bd);
503     }
504   }
505   return sum_error;
506 }
507 #endif  // CONFIG_AV1_HIGHBITDEPTH
508 
509 /* The warp filter for ROTZOOM and AFFINE models works as follows:
510    * Split the input into 8x8 blocks
511    * For each block, project the point (4, 4) within the block, to get the
512      overall block position. Split into integer and fractional coordinates,
513      maintaining full WARPEDMODEL precision
514    * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
515      variable horizontal offset. This means that, while the rows of the
516      intermediate buffer align with the rows of the *reference* image, the
517      columns align with the columns of the *destination* image.
518    * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
519      destination is too small we crop the output at this stage). Each pixel has
520      a variable vertical offset, so that the resulting rows are aligned with
521      the rows of the destination image.
522 
523    To accomplish these alignments, we factor the warp matrix as a
524    product of two shear / asymmetric zoom matrices:
525    / a b \  = /   1       0    \ * / 1+alpha  beta \
526    \ c d /    \ gamma  1+delta /   \    0      1   /
527    where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
528    The horizontal shear (with alpha and beta) is applied first,
529    then the vertical shear (with gamma and delta) is applied second.
530 
531    The only limitation is that, to fit this in a fixed 8-tap filter size,
532    the fractional pixel offsets must be at most +-1. Since the horizontal filter
533    generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
534    within the block, the parameters must satisfy
535    4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
536    for this filter to be applicable.
537 
538    Note: This function assumes that the caller has done all of the relevant
539    checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
540    are set appropriately (if using a ROTZOOM model), and that alpha, beta,
541    gamma, delta are all in range.
542 
543    TODO(david.barker): Maybe support scaled references?
544 */
545 /* A note on hardware implementation:
546     The warp filter is intended to be implementable using the same hardware as
547     the high-precision convolve filters from the loop-restoration and
548     convolve-round experiments.
549 
550     For a single filter stage, considering all of the coefficient sets for the
551     warp filter and the regular convolution filter, an input in the range
552     [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
553     before rounding.
554 
555     Allowing for some changes to the filter coefficient sets, call the range
556     [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
557     we can replace this by the range [0, 256 * 2^k], which can be stored in an
558     unsigned value with 8 + k bits.
559 
560     This allows the derivation of the appropriate bit widths and offsets for
561     the various intermediate values: If
562 
563     F := FILTER_BITS = 7 (or else the above ranges need adjusting)
564          So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
565          intermediate value.
566     H := ROUND0_BITS
567     V := VERSHEAR_REDUCE_PREC_BITS
568     (and note that we must have H + V = 2*F for the output to have the same
569      scale as the input)
570 
571     then we end up with the following offsets and ranges:
572     Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
573                        uint{bd + F + 1}
574     After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
575     Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
576                      uint{bd + 2*F + 2 - H}
577     After rounding: The final value, before undoing the offset, fits into a
578                     uint{bd + 2}.
579 
580     Then we need to undo the offsets before clamping to a pixel. Note that,
581     if we do this at the end, the amount to subtract is actually independent
582     of H and V:
583 
584     offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
585                          (1 << ((bd + 2*F - H) - V))
586                       == (1 << (bd - 1)) + (1 << bd)
587 
588     This allows us to entirely avoid clamping in both the warp filter and
589     the convolve-round experiment. As of the time of writing, the Wiener filter
590     from loop-restoration can encode a central coefficient up to 216, which
591     leads to a maximum value of about 282 * 2^k after applying the offset.
592     So in that case we still need to clamp.
593 */
av1_warp_affine_c(const int32_t * mat,const uint8_t * ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)594 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
595                        int height, int stride, uint8_t *pred, int p_col,
596                        int p_row, int p_width, int p_height, int p_stride,
597                        int subsampling_x, int subsampling_y,
598                        ConvolveParams *conv_params, int16_t alpha, int16_t beta,
599                        int16_t gamma, int16_t delta) {
600   int32_t tmp[15 * 8];
601   const int bd = 8;
602   const int reduce_bits_horiz = conv_params->round_0;
603   const int reduce_bits_vert = conv_params->is_compound
604                                    ? conv_params->round_1
605                                    : 2 * FILTER_BITS - reduce_bits_horiz;
606   const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
607   const int offset_bits_horiz = bd + FILTER_BITS - 1;
608   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
609   const int round_bits =
610       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
611   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
612   (void)max_bits_horiz;
613   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
614   assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
615 
616   for (int i = p_row; i < p_row + p_height; i += 8) {
617     for (int j = p_col; j < p_col + p_width; j += 8) {
618       // Calculate the center of this 8x8 block,
619       // project to luma coordinates (if in a subsampled chroma plane),
620       // apply the affine transformation,
621       // then convert back to the original coordinates (if necessary)
622       const int32_t src_x = (j + 4) << subsampling_x;
623       const int32_t src_y = (i + 4) << subsampling_y;
624       const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
625       const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
626       const int32_t x4 = dst_x >> subsampling_x;
627       const int32_t y4 = dst_y >> subsampling_y;
628 
629       int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
630       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
631       int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
632       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
633 
634       sx4 += alpha * (-4) + beta * (-4);
635       sy4 += gamma * (-4) + delta * (-4);
636 
637       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
638       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
639 
640       // Horizontal filter
641       for (int k = -7; k < 8; ++k) {
642         // Clamp to top/bottom edge of the frame
643         const int iy = clamp(iy4 + k, 0, height - 1);
644 
645         int sx = sx4 + beta * (k + 4);
646 
647         for (int l = -4; l < 4; ++l) {
648           int ix = ix4 + l - 3;
649           // At this point, sx = sx4 + alpha * l + beta * k
650           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
651                            WARPEDPIXEL_PREC_SHIFTS;
652           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
653           const int16_t *coeffs = av1_warped_filter[offs];
654 
655           int32_t sum = 1 << offset_bits_horiz;
656           for (int m = 0; m < 8; ++m) {
657             // Clamp to left/right edge of the frame
658             const int sample_x = clamp(ix + m, 0, width - 1);
659 
660             sum += ref[iy * stride + sample_x] * coeffs[m];
661           }
662           sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
663           assert(0 <= sum && sum < (1 << max_bits_horiz));
664           tmp[(k + 7) * 8 + (l + 4)] = sum;
665           sx += alpha;
666         }
667       }
668 
669       // Vertical filter
670       for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
671         int sy = sy4 + delta * (k + 4);
672         for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
673           // At this point, sy = sy4 + gamma * l + delta * k
674           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
675                            WARPEDPIXEL_PREC_SHIFTS;
676           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
677           const int16_t *coeffs = av1_warped_filter[offs];
678 
679           int32_t sum = 1 << offset_bits_vert;
680           for (int m = 0; m < 8; ++m) {
681             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
682           }
683 
684           if (conv_params->is_compound) {
685             CONV_BUF_TYPE *p =
686                 &conv_params
687                      ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
688                            (j - p_col + l + 4)];
689             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
690             if (conv_params->do_average) {
691               uint8_t *dst8 =
692                   &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
693               int32_t tmp32 = *p;
694               if (conv_params->use_dist_wtd_comp_avg) {
695                 tmp32 = tmp32 * conv_params->fwd_offset +
696                         sum * conv_params->bck_offset;
697                 tmp32 = tmp32 >> DIST_PRECISION_BITS;
698               } else {
699                 tmp32 += sum;
700                 tmp32 = tmp32 >> 1;
701               }
702               tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
703                       (1 << (offset_bits - conv_params->round_1 - 1));
704               *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
705             } else {
706               *p = sum;
707             }
708           } else {
709             uint8_t *p =
710                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
711             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
712             assert(0 <= sum && sum < (1 << (bd + 2)));
713             *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
714           }
715           sy += gamma;
716         }
717       }
718     }
719   }
720 }
721 
warp_plane(WarpedMotionParams * wm,const uint8_t * const ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params)722 void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
723                 int height, int stride, uint8_t *pred, int p_col, int p_row,
724                 int p_width, int p_height, int p_stride, int subsampling_x,
725                 int subsampling_y, ConvolveParams *conv_params) {
726   assert(wm->wmtype <= AFFINE);
727   if (wm->wmtype == ROTZOOM) {
728     wm->wmmat[5] = wm->wmmat[2];
729     wm->wmmat[4] = -wm->wmmat[3];
730   }
731   const int32_t *const mat = wm->wmmat;
732   const int16_t alpha = wm->alpha;
733   const int16_t beta = wm->beta;
734   const int16_t gamma = wm->gamma;
735   const int16_t delta = wm->delta;
736   av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
737                   p_height, p_stride, subsampling_x, subsampling_y, conv_params,
738                   alpha, beta, gamma, delta);
739 }
740 
av1_calc_frame_error_c(const uint8_t * const ref,int stride,const uint8_t * const dst,int p_width,int p_height,int p_stride)741 int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
742                                const uint8_t *const dst, int p_width,
743                                int p_height, int p_stride) {
744   int64_t sum_error = 0;
745   for (int i = 0; i < p_height; ++i) {
746     for (int j = 0; j < p_width; ++j) {
747       sum_error +=
748           (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
749     }
750   }
751   return sum_error;
752 }
753 
segmented_frame_error(const uint8_t * const ref,int stride,const uint8_t * const dst,int p_width,int p_height,int p_stride,uint8_t * segment_map,int segment_map_stride)754 static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
755                                      const uint8_t *const dst, int p_width,
756                                      int p_height, int p_stride,
757                                      uint8_t *segment_map,
758                                      int segment_map_stride) {
759   int patch_w, patch_h;
760   const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
761   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
762   int64_t sum_error = 0;
763   for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
764     for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
765       int seg_x = j >> WARP_ERROR_BLOCK_LOG;
766       int seg_y = i >> WARP_ERROR_BLOCK_LOG;
767       // Only compute the error if this block contains inliers from the motion
768       // model
769       if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
770 
771       // avoid computing error into the frame padding
772       patch_w = AOMMIN(error_bsize_w, p_width - j);
773       patch_h = AOMMIN(error_bsize_h, p_height - i);
774       sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
775                                         dst + j + i * p_stride, patch_w,
776                                         patch_h, p_stride);
777     }
778   }
779   return sum_error;
780 }
781 
av1_frame_error(int use_hbd,int bd,const uint8_t * ref,int stride,uint8_t * dst,int p_width,int p_height,int p_stride)782 int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
783                         uint8_t *dst, int p_width, int p_height, int p_stride) {
784 #if CONFIG_AV1_HIGHBITDEPTH
785   if (use_hbd) {
786     return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
787                                        CONVERT_TO_SHORTPTR(dst), p_width,
788                                        p_height, p_stride, bd);
789   }
790 #endif
791   (void)use_hbd;
792   (void)bd;
793   return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
794 }
795 
av1_segmented_frame_error(int use_hbd,int bd,const uint8_t * ref,int stride,uint8_t * dst,int p_width,int p_height,int p_stride,uint8_t * segment_map,int segment_map_stride)796 int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
797                                   int stride, uint8_t *dst, int p_width,
798                                   int p_height, int p_stride,
799                                   uint8_t *segment_map,
800                                   int segment_map_stride) {
801 #if CONFIG_AV1_HIGHBITDEPTH
802   if (use_hbd) {
803     return highbd_segmented_frame_error(
804         CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
805         p_height, p_stride, bd, segment_map, segment_map_stride);
806   }
807 #endif
808   (void)use_hbd;
809   (void)bd;
810   return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
811                                segment_map, segment_map_stride);
812 }
813 
av1_warp_plane(WarpedMotionParams * wm,int use_hbd,int bd,const uint8_t * ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params)814 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
815                     const uint8_t *ref, int width, int height, int stride,
816                     uint8_t *pred, int p_col, int p_row, int p_width,
817                     int p_height, int p_stride, int subsampling_x,
818                     int subsampling_y, ConvolveParams *conv_params) {
819 #if CONFIG_AV1_HIGHBITDEPTH
820   if (use_hbd)
821     highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
822                       CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
823                       p_height, p_stride, subsampling_x, subsampling_y, bd,
824                       conv_params);
825   else
826     warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
827                p_height, p_stride, subsampling_x, subsampling_y, conv_params);
828 #else
829   (void)use_hbd;
830   (void)bd;
831   warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
832              p_height, p_stride, subsampling_x, subsampling_y, conv_params);
833 #endif
834 }
835 
836 #define LS_MV_MAX 256  // max mv in 1/8-pel
837 // Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
838 #define LS_STEP 8
839 
840 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
841 // the precision needed is:
842 //   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
843 //   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
844 //   1 [for sign] +
845 //   LEAST_SQUARES_SAMPLES_MAX_BITS
846 //        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
847 // The value is 23
848 #define LS_MAT_RANGE_BITS \
849   ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
850 
851 // Bit-depth reduction from the full-range
852 #define LS_MAT_DOWN_BITS 2
853 
854 // bits range of A, Bx and By after downshifting
855 #define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
856 #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
857 #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
858 
859 // By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are
860 // 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here.
861 #define LS_SQUARE(a)                                          \
862   (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
863    (2 + LS_MAT_DOWN_BITS))
864 #define LS_PRODUCT1(a, b)                                           \
865   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
866    (2 + LS_MAT_DOWN_BITS))
867 #define LS_PRODUCT2(a, b)                                               \
868   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
869    (2 + LS_MAT_DOWN_BITS))
870 
871 #define USE_LIMITED_PREC_MULT 0
872 
873 #if USE_LIMITED_PREC_MULT
874 
875 #define MUL_PREC_BITS 16
resolve_multiplier_64(uint64_t D,int16_t * shift)876 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
877   int msb = 0;
878   uint16_t mult = 0;
879   *shift = 0;
880   if (D != 0) {
881     msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
882                               : get_msb((unsigned int)D));
883     if (msb >= MUL_PREC_BITS) {
884       mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
885       *shift = msb + 1 - MUL_PREC_BITS;
886     } else {
887       mult = (uint16_t)D;
888       *shift = 0;
889     }
890   }
891   return mult;
892 }
893 
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)894 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
895   int32_t ret;
896   int16_t mshift;
897   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
898   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
899   shift -= mshift;
900   if (shift > 0) {
901     return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
902                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
903                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
904   } else {
905     return (int32_t)clamp(v * (1 << (-shift)),
906                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
907                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
908   }
909   return ret;
910 }
911 
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)912 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
913   int16_t mshift;
914   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
915   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
916   shift -= mshift;
917   if (shift > 0) {
918     return (int32_t)clamp(
919         ROUND_POWER_OF_TWO_SIGNED(v, shift),
920         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
921         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
922   } else {
923     return (int32_t)clamp(
924         v * (1 << (-shift)),
925         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
926         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
927   }
928 }
929 
930 #else
931 
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)932 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
933   int64_t v = Px * (int64_t)iDet;
934   return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
935                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
936                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
937 }
938 
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)939 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
940   int64_t v = Px * (int64_t)iDet;
941   return (int32_t)clamp64(
942       ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
943       (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
944       (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
945 }
946 #endif  // USE_LIMITED_PREC_MULT
947 
find_affine_int(int np,const int * pts1,const int * pts2,BLOCK_SIZE bsize,int mvy,int mvx,WarpedMotionParams * wm,int mi_row,int mi_col)948 static int find_affine_int(int np, const int *pts1, const int *pts2,
949                            BLOCK_SIZE bsize, int mvy, int mvx,
950                            WarpedMotionParams *wm, int mi_row, int mi_col) {
951   int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
952   int32_t Bx[2] = { 0, 0 };
953   int32_t By[2] = { 0, 0 };
954 
955   const int bw = block_size_wide[bsize];
956   const int bh = block_size_high[bsize];
957   const int rsuy = bh / 2 - 1;
958   const int rsux = bw / 2 - 1;
959   const int suy = rsuy * 8;
960   const int sux = rsux * 8;
961   const int duy = suy + mvy;
962   const int dux = sux + mvx;
963 
964   // Assume the center pixel of the block has exactly the same motion vector
965   // as transmitted for the block. First shift the origin of the source
966   // points to the block center, and the origin of the destination points to
967   // the block center added to the motion vector transmitted.
968   // Let (xi, yi) denote the source points and (xi', yi') denote destination
969   // points after origin shfifting, for i = 0, 1, 2, .... n-1.
970   // Then if  P = [x0, y0,
971   //               x1, y1
972   //               x2, y1,
973   //                ....
974   //              ]
975   //          q = [x0', x1', x2', ... ]'
976   //          r = [y0', y1', y2', ... ]'
977   // the least squares problems that need to be solved are:
978   //          [h1, h2]' = inv(P'P)P'q and
979   //          [h3, h4]' = inv(P'P)P'r
980   // where the affine transformation is given by:
981   //          x' = h1.x + h2.y
982   //          y' = h3.x + h4.y
983   //
984   // The loop below computes: A = P'P, Bx = P'q, By = P'r
985   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
986   // Contribution from neighbor block
987   for (int i = 0; i < np; i++) {
988     const int dx = pts2[i * 2] - dux;
989     const int dy = pts2[i * 2 + 1] - duy;
990     const int sx = pts1[i * 2] - sux;
991     const int sy = pts1[i * 2 + 1] - suy;
992     // (TODO)yunqing: This comparison wouldn't be necessary if the sample
993     // selection is done in find_samples(). Also, global offset can be removed
994     // while collecting samples.
995     if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
996       A[0][0] += LS_SQUARE(sx);
997       A[0][1] += LS_PRODUCT1(sx, sy);
998       A[1][1] += LS_SQUARE(sy);
999       Bx[0] += LS_PRODUCT2(sx, dx);
1000       Bx[1] += LS_PRODUCT1(sy, dx);
1001       By[0] += LS_PRODUCT1(sx, dy);
1002       By[1] += LS_PRODUCT2(sy, dy);
1003     }
1004   }
1005 
1006   // Just for debugging, and can be removed later.
1007   assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
1008   assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
1009   assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
1010   assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
1011   assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
1012   assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
1013   assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
1014 
1015   // Compute Determinant of A
1016   const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
1017   if (Det == 0) return 1;
1018 
1019   int16_t shift;
1020   int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
1021   shift -= WARPEDMODEL_PREC_BITS;
1022   if (shift < 0) {
1023     iDet <<= (-shift);
1024     shift = 0;
1025   }
1026 
1027   int64_t Px[2], Py[2];
1028   // These divided by the Det, are the least squares solutions
1029   Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
1030   Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
1031   Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
1032   Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
1033 
1034   wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
1035   wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
1036   wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
1037   wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
1038 
1039   const int isuy = (mi_row * MI_SIZE + rsuy);
1040   const int isux = (mi_col * MI_SIZE + rsux);
1041   // Note: In the vx, vy expressions below, the max value of each of the
1042   // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
1043   // for the first term so that the overall sum in the worst case fits
1044   // within 32 bits overall.
1045   const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
1046                      (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
1047                       isuy * wm->wmmat[3]);
1048   const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
1049                      (isux * wm->wmmat[4] +
1050                       isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
1051   wm->wmmat[0] =
1052       clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
1053   wm->wmmat[1] =
1054       clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
1055 
1056   wm->wmmat[6] = wm->wmmat[7] = 0;
1057   return 0;
1058 }
1059 
av1_find_projection(int np,const int * pts1,const int * pts2,BLOCK_SIZE bsize,int mvy,int mvx,WarpedMotionParams * wm_params,int mi_row,int mi_col)1060 int av1_find_projection(int np, const int *pts1, const int *pts2,
1061                         BLOCK_SIZE bsize, int mvy, int mvx,
1062                         WarpedMotionParams *wm_params, int mi_row, int mi_col) {
1063   assert(wm_params->wmtype == AFFINE);
1064 
1065   if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
1066                       mi_col))
1067     return 1;
1068 
1069   // check compatibility with the fast warp filter
1070   if (!av1_get_shear_params(wm_params)) return 1;
1071 
1072   return 0;
1073 }
1074