1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_config.h"
12#include "vp9/common/vp9_loopfilter.h"
13#include "vp9/common/vp9_onyxc_int.h"
14#include "vp9/common/vp9_reconinter.h"
15#include "vpx_mem/vpx_mem.h"
16
17#include "vp9/common/vp9_seg_common.h"
18
19// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
20// Each 1 bit represents a position in which we want to apply the loop filter.
21// Left_ entries refer to whether we apply a filter on the border to the
22// left of the block.   Above_ entries refer to whether or not to apply a
23// filter on the above border.   Int_ entries refer to whether or not to
24// apply borders on the 4x4 edges within the 8x8 block that each bit
25// represents.
26// Since each transform is accompanied by a potentially different type of
27// loop filter there is a different entry in the array for each transform size.
typedef struct {
  // Luma masks, one per transform size.  Each bit stands for one 8x8 block
  // of the 64x64 region, 8 bits per row of blocks.
  uint64_t left_y[TX_SIZES];   // filter across the left edge of the block
  uint64_t above_y[TX_SIZES];  // filter across the top edge of the block
  uint64_t int_4x4_y;          // filter the 4x4 edges inside the block
  // Chroma counterparts of the masks above, held in 16 bits.
  uint16_t left_uv[TX_SIZES];
  uint16_t above_uv[TX_SIZES];
  uint16_t int_4x4_uv;
  // Filter level of each luma 8x8 block, stored in rows of 8 entries.
  uint8_t lfl_y[64];
  // NOTE(review): presumably the chroma filter levels; nothing in the part
  // of the file reviewed here writes this array - confirm against its users.
  uint8_t lfl_uv[16];
} LOOP_FILTER_MASK;
38
39// 64 bit masks for left transform size.  Each 1 represents a position where
40// we should apply a loop filter across the left border of an 8x8 block
41// boundary.
42//
43// In the case of TX_16X16->  ( in low order byte first we end up with
44// a mask that looks like this
45//
46//    10101010
47//    10101010
48//    10101010
49//    10101010
50//    10101010
51//    10101010
52//    10101010
53//    10101010
54//
55// A loopfilter should be applied to every other 8x8 horizontally.
56static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
57    0xffffffffffffffff,  // TX_4X4
58    0xffffffffffffffff,  // TX_8x8
59    0x5555555555555555,  // TX_16x16
60    0x1111111111111111,  // TX_32x32
61};
62
63// 64 bit masks for above transform size.  Each 1 represents a position where
64// we should apply a loop filter across the top border of an 8x8 block
65// boundary.
66//
67// In the case of TX_32x32 ->  ( in low order byte first we end up with
68// a mask that looks like this
69//
70//    11111111
71//    00000000
72//    00000000
73//    00000000
74//    11111111
75//    00000000
76//    00000000
77//    00000000
78//
79// A loopfilter should be applied to every other 4 the row vertically.
80static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
81    0xffffffffffffffff,  // TX_4X4
82    0xffffffffffffffff,  // TX_8x8
83    0x00ff00ff00ff00ff,  // TX_16x16
84    0x000000ff000000ff,  // TX_32x32
85};
86
87// 64 bit masks for prediction sizes (left).  Each 1 represents a position
88// where left border of an 8x8 block.  These are aligned to the right most
89// appropriate bit,  and then shifted into place.
90//
91// In the case of TX_16x32 ->  ( low order byte first ) we end up with
92// a mask that looks like this :
93//
94//  10000000
95//  10000000
96//  10000000
97//  10000000
98//  00000000
99//  00000000
100//  00000000
101//  00000000
102static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
103    0x0000000000000001,  // BLOCK_4X4,
104    0x0000000000000001,  // BLOCK_4X8,
105    0x0000000000000001,  // BLOCK_8X4,
106    0x0000000000000001,  // BLOCK_8X8,
107    0x0000000000000101,  // BLOCK_8X16,
108    0x0000000000000001,  // BLOCK_16X8,
109    0x0000000000000101,  // BLOCK_16X16,
110    0x0000000001010101,  // BLOCK_16X32,
111    0x0000000000000101,  // BLOCK_32X16,
112    0x0000000001010101,  // BLOCK_32X32,
113    0x0101010101010101,  // BLOCK_32X64,
114    0x0000000001010101,  // BLOCK_64X32,
115    0x0101010101010101,  // BLOCK_64X64
116};
117
118// 64 bit mask to shift and set for each prediction size.
119static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
120    0x0000000000000001,  // BLOCK_4X4
121    0x0000000000000001,  // BLOCK_4X8
122    0x0000000000000001,  // BLOCK_8X4
123    0x0000000000000001,  // BLOCK_8X8
124    0x0000000000000001,  // BLOCK_8X16,
125    0x0000000000000003,  // BLOCK_16X8
126    0x0000000000000003,  // BLOCK_16X16
127    0x0000000000000003,  // BLOCK_16X32,
128    0x000000000000000f,  // BLOCK_32X16,
129    0x000000000000000f,  // BLOCK_32X32,
130    0x000000000000000f,  // BLOCK_32X64,
131    0x00000000000000ff,  // BLOCK_64X32,
132    0x00000000000000ff,  // BLOCK_64X64
133};
134// 64 bit mask to shift and set for each prediction size.  A bit is set for
135// each 8x8 block that would be in the left most block of the given block
136// size in the 64x64 block.
137static const uint64_t size_mask[BLOCK_SIZES] = {
138    0x0000000000000001,  // BLOCK_4X4
139    0x0000000000000001,  // BLOCK_4X8
140    0x0000000000000001,  // BLOCK_8X4
141    0x0000000000000001,  // BLOCK_8X8
142    0x0000000000000101,  // BLOCK_8X16,
143    0x0000000000000003,  // BLOCK_16X8
144    0x0000000000000303,  // BLOCK_16X16
145    0x0000000003030303,  // BLOCK_16X32,
146    0x0000000000000f0f,  // BLOCK_32X16,
147    0x000000000f0f0f0f,  // BLOCK_32X32,
148    0x0f0f0f0f0f0f0f0f,  // BLOCK_32X64,
149    0x00000000ffffffff,  // BLOCK_64X32,
150    0xffffffffffffffff,  // BLOCK_64X64
151};
152
153// These are used for masking the left and above borders.
154static const uint64_t left_border =  0x1111111111111111;
155static const uint64_t above_border = 0x000000ff000000ff;
156
// 16 bit masks for uv transform sizes (left edges): the chroma counterpart
// of left_64x64_txform_mask.  The 16 bits form 4 rows of 4 bits each, low
// order bits first.
static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
    0xffff,  // TX_4X4
    0xffff,  // TX_8x8
    0x5555,  // TX_16x16
    0x1111,  // TX_32x32
};
164
// 16 bit masks for uv transform sizes (top edges): the chroma counterpart
// of above_64x64_txform_mask, laid out as 4 rows of 4 bits.
static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
    0xffff,  // TX_4X4
    0xffff,  // TX_8x8
    0x0f0f,  // TX_16x16
    0x000f,  // TX_32x32
};
171
// 16 bit left mask to shift and set for each uv prediction size.  A bit is
// set in the first column of every mask row that the block covers.
static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
    0x0001,  // BLOCK_4X4,
    0x0001,  // BLOCK_4X8,
    0x0001,  // BLOCK_8X4,
    0x0001,  // BLOCK_8X8,
    0x0001,  // BLOCK_8X16,
    0x0001,  // BLOCK_16X8,
    0x0001,  // BLOCK_16X16,
    0x0011,  // BLOCK_16X32,
    0x0001,  // BLOCK_32X16,
    0x0011,  // BLOCK_32X32,
    0x1111,  // BLOCK_32X64,
    0x0011,  // BLOCK_64X32,
    0x1111,  // BLOCK_64X64
};
// 16 bit above mask to shift and set for each uv prediction size.  A bit is
// set for every position along the top row that the block covers.
static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
    0x0001,  // BLOCK_4X4
    0x0001,  // BLOCK_4X8
    0x0001,  // BLOCK_8X4
    0x0001,  // BLOCK_8X8
    0x0001,  // BLOCK_8X16,
    0x0001,  // BLOCK_16X8
    0x0001,  // BLOCK_16X16
    0x0001,  // BLOCK_16X32,
    0x0003,  // BLOCK_32X16,
    0x0003,  // BLOCK_32X32,
    0x0003,  // BLOCK_32X64,
    0x000f,  // BLOCK_64X32,
    0x000f,  // BLOCK_64X64
};
204
// 16 bit mask to shift and set for each uv prediction size.  A bit is set
// for every position of the 4x4 chroma mask grid covered by a block of the
// given size placed in the top left corner.
static const uint16_t size_mask_uv[BLOCK_SIZES] = {
    0x0001,  // BLOCK_4X4
    0x0001,  // BLOCK_4X8
    0x0001,  // BLOCK_8X4
    0x0001,  // BLOCK_8X8
    0x0001,  // BLOCK_8X16,
    0x0001,  // BLOCK_16X8
    0x0001,  // BLOCK_16X16
    0x0011,  // BLOCK_16X32,
    0x0003,  // BLOCK_32X16,
    0x0033,  // BLOCK_32X32,
    0x3333,  // BLOCK_32X64,
    0x00ff,  // BLOCK_64X32,
    0xffff,  // BLOCK_64X64
};
// Chroma versions of left_border / above_border: the first column of every
// row and the whole first row of the 16 bit uv masks.
static const uint16_t left_border_uv =  0x1111;
static const uint16_t above_border_uv = 0x000f;
223
224
225static void lf_init_lut(loop_filter_info_n *lfi) {
226  lfi->mode_lf_lut[DC_PRED] = 0;
227  lfi->mode_lf_lut[D45_PRED] = 0;
228  lfi->mode_lf_lut[D135_PRED] = 0;
229  lfi->mode_lf_lut[D117_PRED] = 0;
230  lfi->mode_lf_lut[D153_PRED] = 0;
231  lfi->mode_lf_lut[D207_PRED] = 0;
232  lfi->mode_lf_lut[D63_PRED] = 0;
233  lfi->mode_lf_lut[V_PRED] = 0;
234  lfi->mode_lf_lut[H_PRED] = 0;
235  lfi->mode_lf_lut[TM_PRED] = 0;
236  lfi->mode_lf_lut[ZEROMV]  = 0;
237  lfi->mode_lf_lut[NEARESTMV] = 1;
238  lfi->mode_lf_lut[NEARMV] = 1;
239  lfi->mode_lf_lut[NEWMV] = 1;
240}
241
242static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
243  int lvl;
244
245  // For each possible value for the loop filter fill out limits
246  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
247    // Set loop filter paramaeters that control sharpness.
248    int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
249
250    if (sharpness_lvl > 0) {
251      if (block_inside_limit > (9 - sharpness_lvl))
252        block_inside_limit = (9 - sharpness_lvl);
253    }
254
255    if (block_inside_limit < 1)
256      block_inside_limit = 1;
257
258    vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
259    vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
260               SIMD_WIDTH);
261  }
262}
263
264void vp9_loop_filter_init(VP9_COMMON *cm) {
265  loop_filter_info_n *lfi = &cm->lf_info;
266  struct loopfilter *lf = &cm->lf;
267  int lvl;
268
269  // init limits for given sharpness
270  update_sharpness(lfi, lf->sharpness_level);
271  lf->last_sharpness_level = lf->sharpness_level;
272
273  // init LUT for lvl  and hev thr picking
274  lf_init_lut(lfi);
275
276  // init hev threshold const vectors
277  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
278    vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
279}
280
281void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
282  int seg_id;
283  // n_shift is the a multiplier for lf_deltas
284  // the multiplier is 1 for when filter_lvl is between 0 and 31;
285  // 2 when filter_lvl is between 32 and 63
286  const int n_shift = default_filt_lvl >> 5;
287  loop_filter_info_n *const lfi = &cm->lf_info;
288  struct loopfilter *const lf = &cm->lf;
289  struct segmentation *const seg = &cm->seg;
290
291  // update limits if sharpness has changed
292  if (lf->last_sharpness_level != lf->sharpness_level) {
293    update_sharpness(lfi, lf->sharpness_level);
294    lf->last_sharpness_level = lf->sharpness_level;
295  }
296
297  for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
298    int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
299
300    // Set the baseline filter values for each segment
301    if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
302      const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
303      lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
304                  ? data
305                  : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER);
306    }
307
308    if (!lf->mode_ref_delta_enabled) {
309      // we could get rid of this if we assume that deltas are set to
310      // zero when not in use; encoder always uses deltas
311      vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
312      continue;
313    }
314
315    intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift);
316    lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
317
318    for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
319      for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
320        const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift)
321                                      + lf->mode_deltas[mode] * (1 << n_shift);
322        lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
323      }
324  }
325}
326
327static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
328                     const MB_MODE_INFO *mbmi) {
329  const int seg = mbmi->segment_id;
330  const int ref = mbmi->ref_frame[0];
331  const int mode = lfi_n->mode_lf_lut[mbmi->mode];
332  const int filter_level = lfi_n->lvl[seg][ref][mode];
333
334  return filter_level;
335}
336
337static void filter_selectively_vert(uint8_t *s, int pitch,
338                                    unsigned int mask_16x16,
339                                    unsigned int mask_8x8,
340                                    unsigned int mask_4x4,
341                                    unsigned int mask_4x4_int,
342                                    const loop_filter_info_n *lfi_n,
343                                    const uint8_t *lfl) {
344  unsigned int mask;
345
346  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
347       mask; mask >>= 1) {
348    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
349
350    if (mask & 1) {
351      if (mask_16x16 & 1) {
352        vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
353                                   lfi->hev_thr);
354        assert(!(mask_8x8 & 1));
355        assert(!(mask_4x4 & 1));
356        assert(!(mask_4x4_int & 1));
357      } else if (mask_8x8 & 1) {
358        vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
359                                        lfi->hev_thr, 1);
360        assert(!(mask_16x16 & 1));
361        assert(!(mask_4x4 & 1));
362      } else if (mask_4x4 & 1) {
363        vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
364                                      lfi->hev_thr, 1);
365        assert(!(mask_16x16 & 1));
366        assert(!(mask_8x8 & 1));
367      }
368    }
369    if (mask_4x4_int & 1)
370      vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
371                                    lfi->hev_thr, 1);
372    s += 8;
373    lfl += 1;
374    mask_16x16 >>= 1;
375    mask_8x8 >>= 1;
376    mask_4x4 >>= 1;
377    mask_4x4_int >>= 1;
378  }
379}
380
381static void filter_selectively_horiz(uint8_t *s, int pitch,
382                                     unsigned int mask_16x16,
383                                     unsigned int mask_8x8,
384                                     unsigned int mask_4x4,
385                                     unsigned int mask_4x4_int,
386                                     const loop_filter_info_n *lfi_n,
387                                     const uint8_t *lfl) {
388  unsigned int mask;
389  int count;
390
391  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
392       mask; mask >>= count) {
393    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
394
395    count = 1;
396    if (mask & 1) {
397      if (mask_16x16 & 1) {
398        if ((mask_16x16 & 3) == 3) {
399          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
400                                       lfi->hev_thr, 2);
401          count = 2;
402        } else {
403          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
404                                       lfi->hev_thr, 1);
405        }
406        assert(!(mask_8x8 & 1));
407        assert(!(mask_4x4 & 1));
408        assert(!(mask_4x4_int & 1));
409      } else if (mask_8x8 & 1) {
410        if ((mask_8x8 & 3) == 3) {
411          // Next block's thresholds
412          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
413
414          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
415          vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
416                                            lfi->hev_thr, 1);
417          vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim,
418                                            lfin->lim, lfin->hev_thr, 1);
419
420          if ((mask_4x4_int & 3) == 3) {
421            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
422            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
423                                            lfi->lim, lfi->hev_thr, 1);
424            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
425                                            lfin->mblim, lfin->lim,
426                                            lfin->hev_thr, 1);
427          } else {
428            if (mask_4x4_int & 1)
429              vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
430                                              lfi->lim, lfi->hev_thr, 1);
431            else if (mask_4x4_int & 2)
432              vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
433                                              lfin->mblim, lfin->lim,
434                                              lfin->hev_thr, 1);
435          }
436          count = 2;
437        } else {
438          vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
439                                            lfi->hev_thr, 1);
440
441          if (mask_4x4_int & 1)
442            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
443                                            lfi->lim, lfi->hev_thr, 1);
444        }
445        assert(!(mask_16x16 & 1));
446        assert(!(mask_4x4 & 1));
447      } else if (mask_4x4 & 1) {
448        if ((mask_4x4 & 3) == 3) {
449          // Next block's thresholds
450          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
451
452          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
453          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
454                                            lfi->hev_thr, 1);
455          vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim,
456                                            lfin->hev_thr, 1);
457
458          if ((mask_4x4_int & 3) == 3) {
459            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
460            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
461                                            lfi->lim, lfi->hev_thr, 1);
462            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
463                                            lfin->mblim, lfin->lim,
464                                            lfin->hev_thr, 1);
465          } else {
466            if (mask_4x4_int & 1)
467              vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
468                                              lfi->lim, lfi->hev_thr, 1);
469            else if (mask_4x4_int & 2)
470              vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
471                                              lfin->mblim, lfin->lim,
472                                              lfin->hev_thr, 1);
473          }
474          count = 2;
475        } else {
476        vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
477                                        lfi->hev_thr, 1);
478
479        if (mask_4x4_int & 1)
480          vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
481                                          lfi->lim, lfi->hev_thr, 1);
482        }
483        assert(!(mask_16x16 & 1));
484        assert(!(mask_8x8 & 1));
485      } else if (mask_4x4_int & 1) {
486        vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
487                                        lfi->lim, lfi->hev_thr, 1);
488      }
489    }
490    s += 8 * count;
491    lfl += count;
492    mask_16x16 >>= count;
493    mask_8x8 >>= count;
494    mask_4x4 >>= count;
495    mask_4x4_int >>= count;
496  }
497}
498
499// This function ors into the current lfm structure, where to do loop
500// filters for the specific mi we are looking at.   It uses information
501// including the block_size_type (32x16, 32x32, etc),  the transform size,
502// whether there were any coefficients encoded, and the loop filter strength
503// block we are currently looking at. Shift is used to position the
504// 1's we produce.
505// TODO(JBB) Need another function for different resolution color..
506static void build_masks(const loop_filter_info_n *const lfi_n,
507                        const MODE_INFO *mi, const int shift_y,
508                        const int shift_uv,
509                        LOOP_FILTER_MASK *lfm) {
510  const BLOCK_SIZE block_size = mi->mbmi.sb_type;
511  const TX_SIZE tx_size_y = mi->mbmi.tx_size;
512  const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi);
513  const int skip = mi->mbmi.skip_coeff;
514  const int seg = mi->mbmi.segment_id;
515  const int ref = mi->mbmi.ref_frame[0];
516  const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
517  const int filter_level = lfi_n->lvl[seg][ref][mode];
518  uint64_t *left_y = &lfm->left_y[tx_size_y];
519  uint64_t *above_y = &lfm->above_y[tx_size_y];
520  uint64_t *int_4x4_y = &lfm->int_4x4_y;
521  uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
522  uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
523  uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
524  int i;
525  int w = num_8x8_blocks_wide_lookup[block_size];
526  int h = num_8x8_blocks_high_lookup[block_size];
527
528  // If filter level is 0 we don't loop filter.
529  if (!filter_level) {
530    return;
531  } else {
532    int index = shift_y;
533    for (i = 0; i < h; i++) {
534      vpx_memset(&lfm->lfl_y[index], filter_level, w);
535      index += 8;
536    }
537  }
538
539  // These set 1 in the current block size for the block size edges.
540  // For instance if the block size is 32x16,   we'll set :
541  //    above =   1111
542  //              0000
543  //    and
544  //    left  =   1000
545  //          =   1000
546  // NOTE : In this example the low bit is left most ( 1000 ) is stored as
547  //        1,  not 8...
548  //
549  // U and v set things on a 16 bit scale.
550  //
551  *above_y |= above_prediction_mask[block_size] << shift_y;
552  *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
553  *left_y |= left_prediction_mask[block_size] << shift_y;
554  *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
555
556  // If the block has no coefficients and is not intra we skip applying
557  // the loop filter on block edges.
558  if (skip && ref > INTRA_FRAME)
559    return;
560
561  // Here we are adding a mask for the transform size.  The transform
562  // size mask is set to be correct for a 64x64 prediction block size. We
563  // mask to match the size of the block we are working on and then shift it
564  // into place..
565  *above_y |= (size_mask[block_size] &
566               above_64x64_txform_mask[tx_size_y]) << shift_y;
567  *above_uv |= (size_mask_uv[block_size] &
568                above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
569
570  *left_y |= (size_mask[block_size] &
571              left_64x64_txform_mask[tx_size_y]) << shift_y;
572  *left_uv |= (size_mask_uv[block_size] &
573               left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
574
575  // Here we are trying to determine what to do with the internal 4x4 block
576  // boundaries.  These differ from the 4x4 boundaries on the outside edge of
577  // an 8x8 in that the internal ones can be skipped and don't depend on
578  // the prediction block size.
579  if (tx_size_y == TX_4X4) {
580    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
581  }
582  if (tx_size_uv == TX_4X4) {
583    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
584  }
585}
586
587// This function does the same thing as the one above with the exception that
588// it only affects the y masks.   It exists because for blocks < 16x16 in size,
589// we only update u and v masks on the first block.
590static void build_y_mask(const loop_filter_info_n *const lfi_n,
591                         const MODE_INFO *mi, const int shift_y,
592                         LOOP_FILTER_MASK *lfm) {
593  const BLOCK_SIZE block_size = mi->mbmi.sb_type;
594  const TX_SIZE tx_size_y = mi->mbmi.tx_size;
595  const int skip = mi->mbmi.skip_coeff;
596  const int seg = mi->mbmi.segment_id;
597  const int ref = mi->mbmi.ref_frame[0];
598  const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
599  const int filter_level = lfi_n->lvl[seg][ref][mode];
600  uint64_t *left_y = &lfm->left_y[tx_size_y];
601  uint64_t *above_y = &lfm->above_y[tx_size_y];
602  uint64_t *int_4x4_y = &lfm->int_4x4_y;
603  int i;
604  int w = num_8x8_blocks_wide_lookup[block_size];
605  int h = num_8x8_blocks_high_lookup[block_size];
606
607  if (!filter_level) {
608    return;
609  } else {
610    int index = shift_y;
611    for (i = 0; i < h; i++) {
612      vpx_memset(&lfm->lfl_y[index], filter_level, w);
613      index += 8;
614    }
615  }
616
617  *above_y |= above_prediction_mask[block_size] << shift_y;
618  *left_y |= left_prediction_mask[block_size] << shift_y;
619
620  if (skip && ref > INTRA_FRAME)
621    return;
622
623  *above_y |= (size_mask[block_size] &
624               above_64x64_txform_mask[tx_size_y]) << shift_y;
625
626  *left_y |= (size_mask[block_size] &
627              left_64x64_txform_mask[tx_size_y]) << shift_y;
628
629  if (tx_size_y == TX_4X4) {
630    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
631  }
632}
633
634// This function sets up the bit masks for the entire 64x64 region represented
635// by mi_row, mi_col.
636// TODO(JBB): This function only works for yv12.
637static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
638                       MODE_INFO **mi_8x8, const int mode_info_stride,
639                       LOOP_FILTER_MASK *lfm) {
640  int idx_32, idx_16, idx_8;
641  const loop_filter_info_n *const lfi_n = &cm->lf_info;
642  MODE_INFO **mip = mi_8x8;
643  MODE_INFO **mip2 = mi_8x8;
644
645  // These are offsets to the next mi in the 64x64 block. It is what gets
646  // added to the mi ptr as we go through each loop.  It helps us to avoids
647  // setting up special row and column counters for each index.  The last step
648  // brings us out back to the starting position.
649  const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4,
650                           -(mode_info_stride << 2) - 4};
651  const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2,
652                           -(mode_info_stride << 1) - 2};
653  const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1};
654
655  // Following variables represent shifts to position the current block
656  // mask over the appropriate block.   A shift of 36 to the left will move
657  // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left
658  // 4 rows to the appropriate spot.
659  const int shift_32_y[] = {0, 4, 32, 36};
660  const int shift_16_y[] = {0, 2, 16, 18};
661  const int shift_8_y[] = {0, 1, 8, 9};
662  const int shift_32_uv[] = {0, 2, 8, 10};
663  const int shift_16_uv[] = {0, 1, 4, 5};
664  int i;
665  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
666                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
667  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
668                        cm->mi_cols - mi_col : MI_BLOCK_SIZE);
669
670  vp9_zero(*lfm);
671
672  // TODO(jimbankoski): Try moving most of the following code into decode
673  // loop and storing lfm in the mbmi structure so that we don't have to go
674  // through the recursive loop structure multiple times.
675  switch (mip[0]->mbmi.sb_type) {
676    case BLOCK_64X64:
677      build_masks(lfi_n, mip[0] , 0, 0, lfm);
678      break;
679    case BLOCK_64X32:
680      build_masks(lfi_n, mip[0], 0, 0, lfm);
681      mip2 = mip + mode_info_stride * 4;
682      if (4 >= max_rows)
683        break;
684      build_masks(lfi_n, mip2[0], 32, 8, lfm);
685      break;
686    case BLOCK_32X64:
687      build_masks(lfi_n, mip[0], 0, 0, lfm);
688      mip2 = mip + 4;
689      if (4 >= max_cols)
690        break;
691      build_masks(lfi_n, mip2[0], 4, 2, lfm);
692      break;
693    default:
694      for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
695        const int shift_y = shift_32_y[idx_32];
696        const int shift_uv = shift_32_uv[idx_32];
697        const int mi_32_col_offset = ((idx_32 & 1) << 2);
698        const int mi_32_row_offset = ((idx_32 >> 1) << 2);
699        if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
700          continue;
701        switch (mip[0]->mbmi.sb_type) {
702          case BLOCK_32X32:
703            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
704            break;
705          case BLOCK_32X16:
706            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
707            if (mi_32_row_offset + 2 >= max_rows)
708              continue;
709            mip2 = mip + mode_info_stride * 2;
710            build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
711            break;
712          case BLOCK_16X32:
713            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
714            if (mi_32_col_offset + 2 >= max_cols)
715              continue;
716            mip2 = mip + 2;
717            build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
718            break;
719          default:
720            for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
721              const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
722              const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
723              const int mi_16_col_offset = mi_32_col_offset +
724                  ((idx_16 & 1) << 1);
725              const int mi_16_row_offset = mi_32_row_offset +
726                  ((idx_16 >> 1) << 1);
727
728              if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
729                continue;
730
731              switch (mip[0]->mbmi.sb_type) {
732                case BLOCK_16X16:
733                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
734                  break;
735                case BLOCK_16X8:
736                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
737                  if (mi_16_row_offset + 1 >= max_rows)
738                    continue;
739                  mip2 = mip + mode_info_stride;
740                  build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
741                  break;
742                case BLOCK_8X16:
743                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
744                  if (mi_16_col_offset +1 >= max_cols)
745                    continue;
746                  mip2 = mip + 1;
747                  build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
748                  break;
749                default: {
750                  const int shift_y = shift_32_y[idx_32] +
751                                      shift_16_y[idx_16] +
752                                      shift_8_y[0];
753                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
754                  mip += offset[0];
755                  for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
756                    const int shift_y = shift_32_y[idx_32] +
757                                        shift_16_y[idx_16] +
758                                        shift_8_y[idx_8];
759                    const int mi_8_col_offset = mi_16_col_offset +
760                        ((idx_8 & 1));
761                    const int mi_8_row_offset = mi_16_row_offset +
762                        ((idx_8 >> 1));
763
764                    if (mi_8_col_offset >= max_cols ||
765                        mi_8_row_offset >= max_rows)
766                      continue;
767                    build_y_mask(lfi_n, mip[0], shift_y, lfm);
768                  }
769                  break;
770                }
771              }
772            }
773            break;
774        }
775      }
776      break;
777  }
778  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
779  // for 32x32 transforms also also.
780  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
781  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
782  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
783  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
784
785  // We do at least 8 tap filter on every 32x32 even if the transform size
786  // is 4x4.  So if the 4x4 is set on a border pixel add it to the 8x8 and
787  // remove it from the 4x4.
788  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
789  lfm->left_y[TX_4X4] &= ~left_border;
790  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
791  lfm->above_y[TX_4X4] &= ~above_border;
792  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
793  lfm->left_uv[TX_4X4] &= ~left_border_uv;
794  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
795  lfm->above_uv[TX_4X4] &= ~above_border_uv;
796
797  // We do some special edge handling.
798  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
799    const uint64_t rows = cm->mi_rows - mi_row;
800
801    // Each pixel inside the border gets a 1,
802    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
803    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
804
805    // Remove values completely outside our border.
806    for (i = 0; i < TX_32X32; i++) {
807      lfm->left_y[i] &= mask_y;
808      lfm->above_y[i] &= mask_y;
809      lfm->left_uv[i] &= mask_uv;
810      lfm->above_uv[i] &= mask_uv;
811    }
812    lfm->int_4x4_y &= mask_y;
813    lfm->int_4x4_uv &= mask_uv;
814
815    // We don't apply a wide loop filter on the last uv block row.  If set
816    // apply the shorter one instead.
817    if (rows == 1) {
818      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
819      lfm->above_uv[TX_16X16] = 0;
820    }
821    if (rows == 5) {
822      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
823      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
824    }
825  }
826
827  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
828    const uint64_t columns = cm->mi_cols - mi_col;
829
830    // Each pixel inside the border gets a 1, the multiply copies the border
831    // to where we need it.
832    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101;
833    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
834
835    // Internal edges are not applied on the last column of the image so
836    // we mask 1 more for the internal edges
837    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
838
839    // Remove the bits outside the image edge.
840    for (i = 0; i < TX_32X32; i++) {
841      lfm->left_y[i] &= mask_y;
842      lfm->above_y[i] &= mask_y;
843      lfm->left_uv[i] &= mask_uv;
844      lfm->above_uv[i] &= mask_uv;
845    }
846    lfm->int_4x4_y &= mask_y;
847    lfm->int_4x4_uv &= mask_uv_int;
848
849    // We don't apply a wide loop filter on the last uv column.  If set
850    // apply the shorter one instead.
851    if (columns == 1) {
852      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
853      lfm->left_uv[TX_16X16] = 0;
854    }
855    if (columns == 5) {
856      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
857      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
858    }
859  }
  // We don't apply a loop filter on the first column in the image.  Mask that
  // out.
861  if (mi_col == 0) {
862    for (i = 0; i < TX_32X32; i++) {
863      lfm->left_y[i] &= 0xfefefefefefefefe;
864      lfm->left_uv[i] &= 0xeeee;
865    }
866  }
867}
868
869#if CONFIG_NON420
870static void filter_block_plane_non420(VP9_COMMON *cm,
871                                      struct macroblockd_plane *plane,
872                                      MODE_INFO **mi_8x8,
873                                      int mi_row, int mi_col) {
874  const int ss_x = plane->subsampling_x;
875  const int ss_y = plane->subsampling_y;
876  const int row_step = 1 << ss_x;
877  const int col_step = 1 << ss_y;
878  const int row_step_stride = cm->mode_info_stride * row_step;
879  struct buf_2d *const dst = &plane->dst;
880  uint8_t* const dst0 = dst->buf;
881  unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
882  unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
883  unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
884  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
885  uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
886  int r, c;
887
888  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
889    unsigned int mask_16x16_c = 0;
890    unsigned int mask_8x8_c = 0;
891    unsigned int mask_4x4_c = 0;
892    unsigned int border_mask;
893
894    // Determine the vertical edges that need filtering
895    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
896      const MODE_INFO *mi = mi_8x8[c];
897      const int skip_this = mi[0].mbmi.skip_coeff
898                            && is_inter_block(&mi[0].mbmi);
899      // left edge of current unit is block/partition edge -> no skip
900      const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ?
901          !(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1;
902      const int skip_this_c = skip_this && !block_edge_left;
903      // top edge of current unit is block/partition edge -> no skip
904      const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ?
905          !(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1;
906      const int skip_this_r = skip_this && !block_edge_above;
907      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
908                            ? get_uv_tx_size(&mi[0].mbmi)
909                            : mi[0].mbmi.tx_size;
910      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
911      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
912
913      // Filter level can vary per MI
914      if (!(lfl[(r << 3) + (c >> ss_x)] =
915          build_lfi(&cm->lf_info, &mi[0].mbmi)))
916        continue;
917
918      // Build masks based on the transform size of each block
919      if (tx_size == TX_32X32) {
920        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {
921          if (!skip_border_4x4_c)
922            mask_16x16_c |= 1 << (c >> ss_x);
923          else
924            mask_8x8_c |= 1 << (c >> ss_x);
925        }
926        if (!skip_this_r && ((r >> ss_y) & 3) == 0) {
927          if (!skip_border_4x4_r)
928            mask_16x16[r] |= 1 << (c >> ss_x);
929          else
930            mask_8x8[r] |= 1 << (c >> ss_x);
931        }
932      } else if (tx_size == TX_16X16) {
933        if (!skip_this_c && ((c >> ss_x) & 1) == 0) {
934          if (!skip_border_4x4_c)
935            mask_16x16_c |= 1 << (c >> ss_x);
936          else
937            mask_8x8_c |= 1 << (c >> ss_x);
938        }
939        if (!skip_this_r && ((r >> ss_y) & 1) == 0) {
940          if (!skip_border_4x4_r)
941            mask_16x16[r] |= 1 << (c >> ss_x);
942          else
943            mask_8x8[r] |= 1 << (c >> ss_x);
944        }
945      } else {
946        // force 8x8 filtering on 32x32 boundaries
947        if (!skip_this_c) {
948          if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0)
949            mask_8x8_c |= 1 << (c >> ss_x);
950          else
951            mask_4x4_c |= 1 << (c >> ss_x);
952        }
953
954        if (!skip_this_r) {
955          if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0)
956            mask_8x8[r] |= 1 << (c >> ss_x);
957          else
958            mask_4x4[r] |= 1 << (c >> ss_x);
959        }
960
961        if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
962          mask_4x4_int[r] |= 1 << (c >> ss_x);
963      }
964    }
965
966    // Disable filtering on the leftmost column
967    border_mask = ~(mi_col == 0);
968    filter_selectively_vert(dst->buf, dst->stride,
969                            mask_16x16_c & border_mask,
970                            mask_8x8_c & border_mask,
971                            mask_4x4_c & border_mask,
972                            mask_4x4_int[r],
973                            &cm->lf_info, &lfl[r << 3]);
974    dst->buf += 8 * dst->stride;
975    mi_8x8 += row_step_stride;
976  }
977
978  // Now do horizontal pass
979  dst->buf = dst0;
980  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
981    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
982    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
983
984    unsigned int mask_16x16_r;
985    unsigned int mask_8x8_r;
986    unsigned int mask_4x4_r;
987
988    if (mi_row + r == 0) {
989      mask_16x16_r = 0;
990      mask_8x8_r = 0;
991      mask_4x4_r = 0;
992    } else {
993      mask_16x16_r = mask_16x16[r];
994      mask_8x8_r = mask_8x8[r];
995      mask_4x4_r = mask_4x4[r];
996    }
997
998    filter_selectively_horiz(dst->buf, dst->stride,
999                             mask_16x16_r,
1000                             mask_8x8_r,
1001                             mask_4x4_r,
1002                             mask_4x4_int_r,
1003                             &cm->lf_info, &lfl[r << 3]);
1004    dst->buf += 8 * dst->stride;
1005  }
1006}
1007#endif
1008
1009static void filter_block_plane(VP9_COMMON *const cm,
1010                               struct macroblockd_plane *const plane,
1011                               int mi_row,
1012                               LOOP_FILTER_MASK *lfm) {
1013  struct buf_2d *const dst = &plane->dst;
1014  uint8_t* const dst0 = dst->buf;
1015  unsigned int mask_4x4_int_row[MI_BLOCK_SIZE] = {0};
1016  int r, c;
1017
1018  if (!plane->plane_type) {
1019    uint64_t mask_16x16 = lfm->left_y[TX_16X16];
1020    uint64_t mask_8x8 = lfm->left_y[TX_8X8];
1021    uint64_t mask_4x4 = lfm->left_y[TX_4X4];
1022    uint64_t mask_4x4_int = lfm->int_4x4_y;
1023
1024    // Vertical pass
1025    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
1026      mask_4x4_int_row[r] = mask_4x4_int & 0xff;
1027
1028      // Disable filtering on the leftmost column
1029      filter_selectively_vert(dst->buf, dst->stride,
1030                              mask_16x16 & 0xff,
1031                              mask_8x8 & 0xff,
1032                              mask_4x4 & 0xff,
1033                              mask_4x4_int_row[r],
1034                              &cm->lf_info, &lfm->lfl_y[r << 3]);
1035
1036      dst->buf += 8 * dst->stride;
1037      mask_16x16 >>= 8;
1038      mask_8x8 >>= 8;
1039      mask_4x4 >>= 8;
1040      mask_4x4_int >>= 8;
1041    }
1042
1043    // Horizontal pass
1044    dst->buf = dst0;
1045    mask_16x16 = lfm->above_y[TX_16X16];
1046    mask_8x8 = lfm->above_y[TX_8X8];
1047    mask_4x4 = lfm->above_y[TX_4X4];
1048
1049    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
1050      unsigned int mask_16x16_r;
1051      unsigned int mask_8x8_r;
1052      unsigned int mask_4x4_r;
1053
1054      if (mi_row + r == 0) {
1055        mask_16x16_r = 0;
1056        mask_8x8_r = 0;
1057        mask_4x4_r = 0;
1058      } else {
1059        mask_16x16_r = mask_16x16 & 0xff;
1060        mask_8x8_r = mask_8x8 & 0xff;
1061        mask_4x4_r = mask_4x4 & 0xff;
1062      }
1063
1064      filter_selectively_horiz(dst->buf, dst->stride,
1065                               mask_16x16_r,
1066                               mask_8x8_r,
1067                               mask_4x4_r,
1068                               mask_4x4_int_row[r],
1069                               &cm->lf_info, &lfm->lfl_y[r << 3]);
1070
1071      dst->buf += 8 * dst->stride;
1072      mask_16x16 >>= 8;
1073      mask_8x8 >>= 8;
1074      mask_4x4 >>= 8;
1075    }
1076  } else {
1077    uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
1078    uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
1079    uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
1080    uint16_t mask_4x4_int = lfm->int_4x4_uv;
1081
1082    // Vertical pass
1083    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
1084      if (plane->plane_type == 1) {
1085        for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++)
1086          lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
1087      }
1088
1089      mask_4x4_int_row[r] = mask_4x4_int & 0xf;
1090      // Disable filtering on the leftmost column
1091      filter_selectively_vert(dst->buf, dst->stride,
1092                              mask_16x16 & 0xf,
1093                              mask_8x8 & 0xf,
1094                              mask_4x4 & 0xf,
1095                              mask_4x4_int_row[r],
1096                              &cm->lf_info, &lfm->lfl_uv[r << 1]);
1097
1098      dst->buf += 8 * dst->stride;
1099      mask_16x16 >>= 4;
1100      mask_8x8 >>= 4;
1101      mask_4x4 >>= 4;
1102      mask_4x4_int >>= 4;
1103    }
1104
1105    // Horizontal pass
1106    dst->buf = dst0;
1107    mask_16x16 = lfm->above_uv[TX_16X16];
1108    mask_8x8 = lfm->above_uv[TX_8X8];
1109    mask_4x4 = lfm->above_uv[TX_4X4];
1110
1111    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
1112      const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
1113      const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
1114          0 : (mask_4x4_int_row[r]);
1115      unsigned int mask_16x16_r;
1116      unsigned int mask_8x8_r;
1117      unsigned int mask_4x4_r;
1118
1119      if (mi_row + r == 0) {
1120        mask_16x16_r = 0;
1121        mask_8x8_r = 0;
1122        mask_4x4_r = 0;
1123      } else {
1124        mask_16x16_r = mask_16x16 & 0xf;
1125        mask_8x8_r = mask_8x8 & 0xf;
1126        mask_4x4_r = mask_4x4 & 0xf;
1127      }
1128
1129      filter_selectively_horiz(dst->buf, dst->stride,
1130                               mask_16x16_r,
1131                               mask_8x8_r,
1132                               mask_4x4_r,
1133                               mask_4x4_int_r,
1134                               &cm->lf_info, &lfm->lfl_uv[r << 1]);
1135
1136      dst->buf += 8 * dst->stride;
1137      mask_16x16 >>= 4;
1138      mask_8x8 >>= 4;
1139      mask_4x4 >>= 4;
1140    }
1141  }
1142}
1143
1144void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
1145                          VP9_COMMON *cm, MACROBLOCKD *xd,
1146                          int start, int stop, int y_only) {
1147  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
1148  int mi_row, mi_col;
1149  LOOP_FILTER_MASK lfm;
1150#if CONFIG_NON420
1151  int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
1152      xd->plane[1].subsampling_x == 1);
1153#endif
1154
1155  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
1156    MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride;
1157
1158    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
1159      int plane;
1160
1161      setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
1162
1163      // TODO(JBB): Make setup_mask work for non 420.
1164#if CONFIG_NON420
1165      if (use_420)
1166#endif
1167        setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
1168                   &lfm);
1169
1170      for (plane = 0; plane < num_planes; ++plane) {
1171#if CONFIG_NON420
1172        if (use_420)
1173#endif
1174          filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
1175#if CONFIG_NON420
1176        else
1177          filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
1178                                    mi_row, mi_col);
1179#endif
1180      }
1181    }
1182  }
1183}
1184
1185void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
1186                           int frame_filter_level,
1187                           int y_only, int partial) {
1188  int start_mi_row, end_mi_row, mi_rows_to_filter;
1189  if (!frame_filter_level) return;
1190  start_mi_row = 0;
1191  mi_rows_to_filter = cm->mi_rows;
1192  if (partial && cm->mi_rows > 8) {
1193    start_mi_row = cm->mi_rows >> 1;
1194    start_mi_row &= 0xfffffff8;
1195    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
1196  }
1197  end_mi_row = start_mi_row + mi_rows_to_filter;
1198  vp9_loop_filter_frame_init(cm, frame_filter_level);
1199  vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
1200                       start_mi_row, end_mi_row,
1201                       y_only);
1202}
1203
1204int vp9_loop_filter_worker(void *arg1, void *arg2) {
1205  LFWorkerData *const lf_data = (LFWorkerData*)arg1;
1206  (void)arg2;
1207  vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
1208                       lf_data->start, lf_data->stop, lf_data->y_only);
1209  return 1;
1210}
1211