brw_cs.c revision 3d2485f011c4f1c7de35871ca359a84415bfcc06
1/*
2 * Copyright (c) 2014 - 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#include "util/ralloc.h"
25#include "brw_context.h"
26#include "brw_cs.h"
27#include "brw_eu.h"
28#include "brw_wm.h"
29#include "brw_shader.h"
30#include "intel_mipmap_tree.h"
31#include "brw_state.h"
32#include "intel_batchbuffer.h"
33#include "brw_nir.h"
34#include "brw_program.h"
35#include "compiler/glsl/ir_uniform.h"
36
37static void
38assign_cs_binding_table_offsets(const struct gen_device_info *devinfo,
39                                const struct gl_shader_program *shader_prog,
40                                const struct gl_program *prog,
41                                struct brw_cs_prog_data *prog_data)
42{
43   uint32_t next_binding_table_offset = 0;
44
45   /* May not be used if the gl_NumWorkGroups variable is not accessed. */
46   prog_data->binding_table.work_groups_start = next_binding_table_offset;
47   next_binding_table_offset++;
48
49   brw_assign_common_binding_table_offsets(MESA_SHADER_COMPUTE, devinfo,
50                                           shader_prog, prog, &prog_data->base,
51                                           next_binding_table_offset);
52}
53
54static bool
55brw_codegen_cs_prog(struct brw_context *brw,
56                    struct gl_shader_program *prog,
57                    struct brw_program *cp,
58                    struct brw_cs_prog_key *key)
59{
60   const struct gen_device_info *devinfo = &brw->screen->devinfo;
61   struct gl_context *ctx = &brw->ctx;
62   const GLuint *program;
63   void *mem_ctx = ralloc_context(NULL);
64   GLuint program_size;
65   struct brw_cs_prog_data prog_data;
66   bool start_busy = false;
67   double start_time = 0;
68
69   memset(&prog_data, 0, sizeof(prog_data));
70
71   if (cp->program.info.cs.shared_size > 64 * 1024) {
72      cp->program.sh.data->LinkStatus = false;
73      const char *error_str =
74         "Compute shader used more than 64KB of shared variables";
75      ralloc_strcat(&cp->program.sh.data->InfoLog, error_str);
76      _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str);
77
78      ralloc_free(mem_ctx);
79      return false;
80   } else {
81      prog_data.base.total_shared = cp->program.info.cs.shared_size;
82   }
83
84   assign_cs_binding_table_offsets(devinfo, prog, &cp->program, &prog_data);
85
86   /* Allocate the references to the uniforms that will end up in the
87    * prog_data associated with the compiled program, and which will be freed
88    * by the state cache.
89    */
90   int param_count = cp->program.nir->num_uniforms / 4;
91
92   /* The backend also sometimes add a param for the thread local id. */
93   prog_data.thread_local_id_index = param_count++;
94
95   /* The backend also sometimes adds params for texture size. */
96   param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
97   prog_data.base.param =
98      rzalloc_array(NULL, const gl_constant_value *, param_count);
99   prog_data.base.pull_param =
100      rzalloc_array(NULL, const gl_constant_value *, param_count);
101   prog_data.base.image_param =
102      rzalloc_array(NULL, struct brw_image_param,
103                    cp->program.info.num_images);
104   prog_data.base.nr_params = param_count;
105   prog_data.base.nr_image_params = cp->program.info.num_images;
106
107   brw_nir_setup_glsl_uniforms(cp->program.nir, prog, &cp->program,
108                               &prog_data.base, true);
109
110   if (unlikely(brw->perf_debug)) {
111      start_busy = (brw->batch.last_bo &&
112                    drm_intel_bo_busy(brw->batch.last_bo));
113      start_time = get_time();
114   }
115
116   int st_index = -1;
117   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
118      st_index = brw_get_shader_time_index(brw, &cp->program, ST_CS, true);
119
120   char *error_str;
121   program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key,
122                            &prog_data, cp->program.nir, st_index,
123                            &program_size, &error_str);
124   if (program == NULL) {
125      cp->program.sh.data->LinkStatus = false;
126      ralloc_strcat(&cp->program.sh.data->InfoLog, error_str);
127      _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str);
128
129      ralloc_free(mem_ctx);
130      return false;
131   }
132
133   if (unlikely(brw->perf_debug)) {
134      if (cp->compiled_once) {
135         _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles");
136      }
137      cp->compiled_once = true;
138
139      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
140         perf_debug("CS compile took %.03f ms and stalled the GPU\n",
141                    (get_time() - start_time) * 1000);
142      }
143   }
144
145   const unsigned subslices = MAX2(brw->screen->subslice_total, 1);
146
147   /* WaCSScratchSize:hsw
148    *
149    * Haswell's scratch space address calculation appears to be sparse
150    * rather than tightly packed.  The Thread ID has bits indicating
151    * which subslice, EU within a subslice, and thread within an EU
152    * it is.  There's a maximum of two slices and two subslices, so these
153    * can be stored with a single bit.  Even though there are only 10 EUs
154    * per subslice, this is stored in 4 bits, so there's an effective
155    * maximum value of 16 EUs.  Similarly, although there are only 7
156    * threads per EU, this is stored in a 3 bit number, giving an effective
157    * maximum value of 8 threads per EU.
158    *
159    * This means that we need to use 16 * 8 instead of 10 * 7 for the
160    * number of threads per subslice.
161    */
162   const unsigned scratch_ids_per_subslice =
163      brw->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
164
165   brw_alloc_stage_scratch(brw, &brw->cs.base,
166                           prog_data.base.total_scratch,
167                           scratch_ids_per_subslice * subslices);
168
169   brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG,
170                    key, sizeof(*key),
171                    program, program_size,
172                    &prog_data, sizeof(prog_data),
173                    &brw->cs.base.prog_offset, &brw->cs.base.prog_data);
174   ralloc_free(mem_ctx);
175
176   return true;
177}
178
179
180static void
181brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key)
182{
183   struct gl_context *ctx = &brw->ctx;
184   /* BRW_NEW_COMPUTE_PROGRAM */
185   const struct brw_program *cp = (struct brw_program *) brw->compute_program;
186   const struct gl_program *prog = (struct gl_program *) cp;
187
188   memset(key, 0, sizeof(*key));
189
190   /* _NEW_TEXTURE */
191   brw_populate_sampler_prog_key_data(ctx, prog, &key->tex);
192
193   /* The unique compute program ID */
194   key->program_string_id = cp->id;
195}
196
197
198void
199brw_upload_cs_prog(struct brw_context *brw)
200{
201   struct gl_context *ctx = &brw->ctx;
202   struct brw_cs_prog_key key;
203   struct brw_program *cp = (struct brw_program *) brw->compute_program;
204
205   if (!cp)
206      return;
207
208   if (!brw_state_dirty(brw, _NEW_TEXTURE, BRW_NEW_COMPUTE_PROGRAM))
209      return;
210
211   brw->cs.base.sampler_count =
212      util_last_bit(ctx->ComputeProgram._Current->SamplersUsed);
213
214   brw_cs_populate_key(brw, &key);
215
216   if (!brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG,
217                         &key, sizeof(key),
218                         &brw->cs.base.prog_offset,
219                         &brw->cs.base.prog_data)) {
220      bool success =
221         brw_codegen_cs_prog(brw,
222                             ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE],
223                             cp, &key);
224      (void) success;
225      assert(success);
226   }
227}
228
229
230bool
231brw_cs_precompile(struct gl_context *ctx,
232                  struct gl_shader_program *shader_prog,
233                  struct gl_program *prog)
234{
235   struct brw_context *brw = brw_context(ctx);
236   struct brw_cs_prog_key key;
237
238   struct brw_program *bcp = brw_program(prog);
239
240   memset(&key, 0, sizeof(key));
241   key.program_string_id = bcp->id;
242
243   brw_setup_tex_for_precompile(brw, &key.tex, prog);
244
245   uint32_t old_prog_offset = brw->cs.base.prog_offset;
246   struct brw_stage_prog_data *old_prog_data = brw->cs.base.prog_data;
247
248   bool success = brw_codegen_cs_prog(brw, shader_prog, bcp, &key);
249
250   brw->cs.base.prog_offset = old_prog_offset;
251   brw->cs.base.prog_data = old_prog_data;
252
253   return success;
254}
255