1/* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vpx_config.h" 12 13#include "vpx_mem/vpx_mem.h" 14 15#include "vp9/common/vp9_reconinter.h" 16 17#include "vp9/decoder/vp9_dthread.h" 18#include "vp9/decoder/vp9_decoder.h" 19 20#if CONFIG_MULTITHREAD 21static INLINE void mutex_lock(pthread_mutex_t *const mutex) { 22 const int kMaxTryLocks = 4000; 23 int locked = 0; 24 int i; 25 26 for (i = 0; i < kMaxTryLocks; ++i) { 27 if (!pthread_mutex_trylock(mutex)) { 28 locked = 1; 29 break; 30 } 31 } 32 33 if (!locked) 34 pthread_mutex_lock(mutex); 35} 36#endif // CONFIG_MULTITHREAD 37 38static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { 39#if CONFIG_MULTITHREAD 40 const int nsync = lf_sync->sync_range; 41 42 if (r && !(c & (nsync - 1))) { 43 pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; 44 mutex_lock(mutex); 45 46 while (c > lf_sync->cur_sb_col[r - 1] - nsync) { 47 pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); 48 } 49 pthread_mutex_unlock(mutex); 50 } 51#else 52 (void)lf_sync; 53 (void)r; 54 (void)c; 55#endif // CONFIG_MULTITHREAD 56} 57 58static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, 59 const int sb_cols) { 60#if CONFIG_MULTITHREAD 61 const int nsync = lf_sync->sync_range; 62 int cur; 63 // Only signal when there are enough filtered SB for next row to run. 64 int sig = 1; 65 66 if (c < sb_cols - 1) { 67 cur = c; 68 if (c % nsync) 69 sig = 0; 70 } else { 71 cur = sb_cols + nsync; 72 } 73 74 if (sig) { 75 mutex_lock(&lf_sync->mutex_[r]); 76 77 lf_sync->cur_sb_col[r] = cur; 78 79 pthread_cond_signal(&lf_sync->cond_[r]); 80 pthread_mutex_unlock(&lf_sync->mutex_[r]); 81 } 82#else 83 (void)lf_sync; 84 (void)r; 85 (void)c; 86 (void)sb_cols; 87#endif // CONFIG_MULTITHREAD 88} 89 90// Implement row loopfiltering for each thread. 91static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, 92 VP9_COMMON *const cm, 93 struct macroblockd_plane planes[MAX_MB_PLANE], 94 int start, int stop, int y_only, 95 VP9LfSync *const lf_sync, int num_lf_workers) { 96 const int num_planes = y_only ? 1 : MAX_MB_PLANE; 97 int r, c; // SB row and col 98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; 99 100 for (r = start; r < stop; r += num_lf_workers) { 101 const int mi_row = r << MI_BLOCK_SIZE_LOG2; 102 MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; 103 104 for (c = 0; c < sb_cols; ++c) { 105 const int mi_col = c << MI_BLOCK_SIZE_LOG2; 106 LOOP_FILTER_MASK lfm; 107 int plane; 108 109 sync_read(lf_sync, r, c); 110 111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); 112 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); 113 114 for (plane = 0; plane < num_planes; ++plane) { 115 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); 116 } 117 118 sync_write(lf_sync, r, c, sb_cols); 119 } 120 } 121} 122 123// Row-based multi-threaded loopfilter hook 124static int loop_filter_row_worker(void *arg1, void *arg2) { 125 TileWorkerData *const tile_data = (TileWorkerData*)arg1; 126 LFWorkerData *const lf_data = &tile_data->lfdata; 127 (void) arg2; 128 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, 129 lf_data->start, lf_data->stop, lf_data->y_only, 130 lf_data->lf_sync, lf_data->num_lf_workers); 131 return 1; 132} 133 134// VP9 decoder: Implement multi-threaded loopfilter that uses the tile 135// threads. 136void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, 137 VP9Decoder *pbi, VP9_COMMON *cm, 138 int frame_filter_level, 139 int y_only) { 140 VP9LfSync *const lf_sync = &pbi->lf_row_sync; 141 const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); 142 // Number of superblock rows and cols 143 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; 144 const int tile_cols = 1 << cm->log2_tile_cols; 145 const int num_workers = MIN(pbi->max_threads & ~1, tile_cols); 146 int i; 147 148 // Allocate memory used in thread synchronization. 149 // This always needs to be done even if frame_filter_level is 0. 150 if (!cm->current_video_frame || cm->last_height != cm->height) { 151 if (cm->last_height != cm->height) { 152 const int aligned_last_height = 153 ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2); 154 const int last_sb_rows = 155 mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >> 156 MI_BLOCK_SIZE_LOG2; 157 158 vp9_loop_filter_dealloc(lf_sync, last_sb_rows); 159 } 160 161 vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width); 162 } 163 164 if (!frame_filter_level) return; 165 166 vp9_loop_filter_frame_init(cm, frame_filter_level); 167 168 // Initialize cur_sb_col to -1 for all SB rows. 169 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); 170 171 // Set up loopfilter thread data. 172 // The decoder is using num_workers instead of pbi->num_tile_workers 173 // because it has been observed that using more threads on the 174 // loopfilter, than there are tile columns in the frame will hurt 175 // performance on Android. This is because the system will only 176 // schedule the tile decode workers on cores equal to the number 177 // of tile columns. Then if the decoder tries to use more threads for the 178 // loopfilter, it will hurt performance because of contention. If the 179 // multithreading code changes in the future then the number of workers 180 // used by the loopfilter should be revisited. 181 for (i = 0; i < num_workers; ++i) { 182 VP9Worker *const worker = &pbi->tile_workers[i]; 183 TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; 184 LFWorkerData *const lf_data = &tile_data->lfdata; 185 186 worker->hook = (VP9WorkerHook)loop_filter_row_worker; 187 188 // Loopfilter data 189 lf_data->frame_buffer = frame; 190 lf_data->cm = cm; 191 vp9_copy(lf_data->planes, pbi->mb.plane); 192 lf_data->start = i; 193 lf_data->stop = sb_rows; 194 lf_data->y_only = y_only; // always do all planes in decoder 195 196 lf_data->lf_sync = lf_sync; 197 lf_data->num_lf_workers = num_workers; 198 199 // Start loopfiltering 200 if (i == num_workers - 1) { 201 winterface->execute(worker); 202 } else { 203 winterface->launch(worker); 204 } 205 } 206 207 // Wait till all rows are finished 208 for (i = 0; i < num_workers; ++i) { 209 winterface->sync(&pbi->tile_workers[i]); 210 } 211} 212 213// Set up nsync by width. 214static int get_sync_range(int width) { 215 // nsync numbers are picked by testing. For example, for 4k 216 // video, using 4 gives best performance. 217 if (width < 640) 218 return 1; 219 else if (width <= 1280) 220 return 2; 221 else if (width <= 4096) 222 return 4; 223 else 224 return 8; 225} 226 227// Allocate memory for lf row synchronization 228void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, 229 int width) { 230#if CONFIG_MULTITHREAD 231 int i; 232 233 CHECK_MEM_ERROR(cm, lf_sync->mutex_, 234 vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); 235 for (i = 0; i < rows; ++i) { 236 pthread_mutex_init(&lf_sync->mutex_[i], NULL); 237 } 238 239 CHECK_MEM_ERROR(cm, lf_sync->cond_, 240 vpx_malloc(sizeof(*lf_sync->cond_) * rows)); 241 for (i = 0; i < rows; ++i) { 242 pthread_cond_init(&lf_sync->cond_[i], NULL); 243 } 244#endif // CONFIG_MULTITHREAD 245 246 CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, 247 vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); 248 249 // Set up nsync. 250 lf_sync->sync_range = get_sync_range(width); 251} 252 253// Deallocate lf synchronization related mutex and data 254void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { 255#if !CONFIG_MULTITHREAD 256 (void)rows; 257#endif // !CONFIG_MULTITHREAD 258 259 if (lf_sync != NULL) { 260#if CONFIG_MULTITHREAD 261 int i; 262 263 if (lf_sync->mutex_ != NULL) { 264 for (i = 0; i < rows; ++i) { 265 pthread_mutex_destroy(&lf_sync->mutex_[i]); 266 } 267 vpx_free(lf_sync->mutex_); 268 } 269 if (lf_sync->cond_ != NULL) { 270 for (i = 0; i < rows; ++i) { 271 pthread_cond_destroy(&lf_sync->cond_[i]); 272 } 273 vpx_free(lf_sync->cond_); 274 } 275#endif // CONFIG_MULTITHREAD 276 vpx_free(lf_sync->cur_sb_col); 277 // clear the structure as the source of this call may be a resize in which 278 // case this call will be followed by an _alloc() which may fail. 279 vp9_zero(*lf_sync); 280 } 281} 282