1/* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vpx_config.h" 12 13#include "vpx_mem/vpx_mem.h" 14 15#include "vp9/common/vp9_reconinter.h" 16 17#include "vp9/decoder/vp9_dthread.h" 18#include "vp9/decoder/vp9_decoder.h" 19 20#if CONFIG_MULTITHREAD 21static INLINE void mutex_lock(pthread_mutex_t *const mutex) { 22 const int kMaxTryLocks = 4000; 23 int locked = 0; 24 int i; 25 26 for (i = 0; i < kMaxTryLocks; ++i) { 27 if (!pthread_mutex_trylock(mutex)) { 28 locked = 1; 29 break; 30 } 31 } 32 33 if (!locked) 34 pthread_mutex_lock(mutex); 35} 36#endif // CONFIG_MULTITHREAD 37 38static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { 39#if CONFIG_MULTITHREAD 40 const int nsync = lf_sync->sync_range; 41 42 if (r && !(c & (nsync - 1))) { 43 mutex_lock(&lf_sync->mutex_[r - 1]); 44 45 while (c > lf_sync->cur_sb_col[r - 1] - nsync) { 46 pthread_cond_wait(&lf_sync->cond_[r - 1], 47 &lf_sync->mutex_[r - 1]); 48 } 49 pthread_mutex_unlock(&lf_sync->mutex_[r - 1]); 50 } 51#else 52 (void)lf_sync; 53 (void)r; 54 (void)c; 55#endif // CONFIG_MULTITHREAD 56} 57 58static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, 59 const int sb_cols) { 60#if CONFIG_MULTITHREAD 61 const int nsync = lf_sync->sync_range; 62 int cur; 63 // Only signal when there are enough filtered SB for next row to run. 64 int sig = 1; 65 66 if (c < sb_cols - 1) { 67 cur = c; 68 if (c % nsync) 69 sig = 0; 70 } else { 71 cur = sb_cols + nsync; 72 } 73 74 if (sig) { 75 mutex_lock(&lf_sync->mutex_[r]); 76 77 lf_sync->cur_sb_col[r] = cur; 78 79 pthread_cond_signal(&lf_sync->cond_[r]); 80 pthread_mutex_unlock(&lf_sync->mutex_[r]); 81 } 82#else 83 (void)lf_sync; 84 (void)r; 85 (void)c; 86 (void)sb_cols; 87#endif // CONFIG_MULTITHREAD 88} 89 90// Implement row loopfiltering for each thread. 91static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, 92 VP9_COMMON *const cm, MACROBLOCKD *const xd, 93 int start, int stop, int y_only, 94 VP9LfSync *const lf_sync, int num_lf_workers) { 95 const int num_planes = y_only ? 1 : MAX_MB_PLANE; 96 int r, c; // SB row and col 97 LOOP_FILTER_MASK lfm; 98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; 99 100 for (r = start; r < stop; r += num_lf_workers) { 101 const int mi_row = r << MI_BLOCK_SIZE_LOG2; 102 MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mi_stride; 103 104 for (c = 0; c < sb_cols; ++c) { 105 const int mi_col = c << MI_BLOCK_SIZE_LOG2; 106 int plane; 107 108 sync_read(lf_sync, r, c); 109 110 vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col); 111 vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mi_stride, &lfm); 112 113 for (plane = 0; plane < num_planes; ++plane) { 114 vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); 115 } 116 117 sync_write(lf_sync, r, c, sb_cols); 118 } 119 } 120} 121 122// Row-based multi-threaded loopfilter hook 123static int loop_filter_row_worker(void *arg1, void *arg2) { 124 TileWorkerData *const tile_data = (TileWorkerData*)arg1; 125 LFWorkerData *const lf_data = &tile_data->lfdata; 126 127 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, &lf_data->xd, 128 lf_data->start, lf_data->stop, lf_data->y_only, 129 lf_data->lf_sync, lf_data->num_lf_workers); 130 return 1; 131} 132 133// VP9 decoder: Implement multi-threaded loopfilter that uses the tile 134// threads. 135void vp9_loop_filter_frame_mt(VP9Decoder *pbi, 136 VP9_COMMON *cm, 137 MACROBLOCKD *xd, 138 int frame_filter_level, 139 int y_only, int partial_frame) { 140 // Number of superblock rows and cols 141 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; 142 const int tile_cols = 1 << cm->log2_tile_cols; 143 const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); 144 int i; 145 146 // Allocate memory used in thread synchronization. 147 // This always needs to be done even if frame_filter_level is 0. 148 if (!cm->current_video_frame || cm->last_height != cm->height) { 149 VP9LfSync *const lf_sync = &pbi->lf_row_sync; 150 151 if (cm->last_height != cm->height) { 152 const int aligned_last_height = 153 ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2); 154 const int last_sb_rows = 155 mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >> 156 MI_BLOCK_SIZE_LOG2; 157 158 vp9_loop_filter_dealloc(lf_sync, last_sb_rows); 159 } 160 161 vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width); 162 } 163 164 if (!frame_filter_level) return; 165 166 vp9_loop_filter_frame_init(cm, frame_filter_level); 167 168 // Initialize cur_sb_col to -1 for all SB rows. 169 vpx_memset(pbi->lf_row_sync.cur_sb_col, -1, 170 sizeof(*pbi->lf_row_sync.cur_sb_col) * sb_rows); 171 172 // Set up loopfilter thread data. 173 // The decoder is using num_workers instead of pbi->num_tile_workers 174 // because it has been observed that using more threads on the 175 // loopfilter, than there are tile columns in the frame will hurt 176 // performance on Android. This is because the system will only 177 // schedule the tile decode workers on cores equal to the number 178 // of tile columns. Then if the decoder tries to use more threads for the 179 // loopfilter, it will hurt performance because of contention. If the 180 // multithreading code changes in the future then the number of workers 181 // used by the loopfilter should be revisited. 182 for (i = 0; i < num_workers; ++i) { 183 VP9Worker *const worker = &pbi->tile_workers[i]; 184 TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; 185 LFWorkerData *const lf_data = &tile_data->lfdata; 186 187 worker->hook = (VP9WorkerHook)loop_filter_row_worker; 188 189 // Loopfilter data 190 lf_data->frame_buffer = get_frame_new_buffer(cm); 191 lf_data->cm = cm; 192 lf_data->xd = pbi->mb; 193 lf_data->start = i; 194 lf_data->stop = sb_rows; 195 lf_data->y_only = y_only; // always do all planes in decoder 196 197 lf_data->lf_sync = &pbi->lf_row_sync; 198 lf_data->num_lf_workers = num_workers; 199 200 // Start loopfiltering 201 if (i == num_workers - 1) { 202 vp9_worker_execute(worker); 203 } else { 204 vp9_worker_launch(worker); 205 } 206 } 207 208 // Wait till all rows are finished 209 for (i = 0; i < num_workers; ++i) { 210 vp9_worker_sync(&pbi->tile_workers[i]); 211 } 212} 213 214// Set up nsync by width. 215static int get_sync_range(int width) { 216 // nsync numbers are picked by testing. For example, for 4k 217 // video, using 4 gives best performance. 218 if (width < 640) 219 return 1; 220 else if (width <= 1280) 221 return 2; 222 else if (width <= 4096) 223 return 4; 224 else 225 return 8; 226} 227 228// Allocate memory for lf row synchronization 229void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, 230 int width) { 231#if CONFIG_MULTITHREAD 232 int i; 233 234 CHECK_MEM_ERROR(cm, lf_sync->mutex_, 235 vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); 236 for (i = 0; i < rows; ++i) { 237 pthread_mutex_init(&lf_sync->mutex_[i], NULL); 238 } 239 240 CHECK_MEM_ERROR(cm, lf_sync->cond_, 241 vpx_malloc(sizeof(*lf_sync->cond_) * rows)); 242 for (i = 0; i < rows; ++i) { 243 pthread_cond_init(&lf_sync->cond_[i], NULL); 244 } 245#endif // CONFIG_MULTITHREAD 246 247 CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, 248 vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); 249 250 // Set up nsync. 251 lf_sync->sync_range = get_sync_range(width); 252} 253 254// Deallocate lf synchronization related mutex and data 255void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { 256#if CONFIG_MULTITHREAD 257 if (lf_sync != NULL) { 258 int i; 259 260 if (lf_sync->mutex_ != NULL) { 261 for (i = 0; i < rows; ++i) { 262 pthread_mutex_destroy(&lf_sync->mutex_[i]); 263 } 264 vpx_free(lf_sync->mutex_); 265 } 266 if (lf_sync->cond_ != NULL) { 267 for (i = 0; i < rows; ++i) { 268 pthread_cond_destroy(&lf_sync->cond_[i]); 269 } 270 vpx_free(lf_sync->cond_); 271 } 272 273 vpx_free(lf_sync->cur_sb_col); 274 // clear the structure as the source of this call may be a resize in which 275 // case this call will be followed by an _alloc() which may fail. 276 vpx_memset(lf_sync, 0, sizeof(*lf_sync)); 277 } 278#else 279 (void)rows; 280 if (lf_sync != NULL) { 281 vpx_free(lf_sync->cur_sb_col); 282 vpx_memset(lf_sync, 0, sizeof(*lf_sync)); 283 } 284#endif // CONFIG_MULTITHREAD 285} 286