1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 *    Eric Anholt <eric@anholt.net>
25 *
26 */
27
28#include "main/macros.h"
29#include "program/program.h"
30#include "program/prog_print.h"
31#include "brw_context.h"
32#include "brw_defines.h"
33#include "brw_eu.h"
34
35const struct brw_instruction_info brw_opcodes[128] = {
36    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 },
37    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 },
38    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 },
39    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 },
40    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 },
41    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 },
42    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 },
43    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
44
45    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 },
46    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 },
47    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 },
48    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 },
49    [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
50    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
51    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
52    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
53    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
54    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
55    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
56    [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
57
58    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 },
59    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 },
60    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 },
61    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 },
62    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 },
63    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 },
64    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 },
65    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 },
66    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
67    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
68    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
69
70    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
71    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
72    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
73    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
74    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
75    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
76    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
77    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
78    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
79    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
80    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
81    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
82    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
83    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
84    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
85    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
86    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
87};
88
89static INLINE
90bool brw_is_arithmetic_inst(const struct brw_instruction *inst)
91{
92   return brw_opcodes[inst->header.opcode].is_arith;
93}
94
95static const GLuint inst_stride[7] = {
96    [0] = 0,
97    [1] = 1,
98    [2] = 2,
99    [3] = 4,
100    [4] = 8,
101    [5] = 16,
102    [6] = 32
103};
104
105static const GLuint inst_type_size[8] = {
106    [BRW_REGISTER_TYPE_UD] = 4,
107    [BRW_REGISTER_TYPE_D] = 4,
108    [BRW_REGISTER_TYPE_UW] = 2,
109    [BRW_REGISTER_TYPE_W] = 2,
110    [BRW_REGISTER_TYPE_UB] = 1,
111    [BRW_REGISTER_TYPE_B] = 1,
112    [BRW_REGISTER_TYPE_F] = 4
113};
114
115static INLINE bool
116brw_is_grf_written(const struct brw_instruction *inst,
117                   int reg_index, int size,
118                   int gen)
119{
120   if (brw_opcodes[inst->header.opcode].ndst == 0)
121      return false;
122
123   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
124      if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
125         return true;
126
127   if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
128      return false;
129
130   const int reg_start = reg_index * REG_SIZE;
131   const int reg_end = reg_start + size;
132
133   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
134   const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
135                         + inst->bits1.da1.dest_subreg_nr;
136   int length, write_end;
137
138   /* SEND is specific */
139   if (inst->header.opcode == BRW_OPCODE_SEND) {
140      if (gen >= 5)
141         length = inst->bits3.generic_gen5.response_length*REG_SIZE;
142      else
143         length = inst->bits3.generic.response_length*REG_SIZE;
144   }
145   else {
146      length = 1 << inst->header.execution_size;
147      length *= type_size;
148      length *= inst->bits1.da1.dest_horiz_stride;
149   }
150
151   /* If the two intervals intersect, we overwrite the register */
152   write_end = write_start + length;
153   const int left = MAX2(write_start, reg_start);
154   const int right = MIN2(write_end, reg_end);
155
156   return left < right;
157}
158
159static bool
160brw_is_mrf_written_alu(const struct brw_instruction *inst,
161		       int reg_index, int size)
162{
163   if (brw_opcodes[inst->header.opcode].ndst == 0)
164      return false;
165
166   if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
167      return false;
168
169   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
170      return true;
171
172   const int reg_start = reg_index * REG_SIZE;
173   const int reg_end = reg_start + size;
174
175   const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
176   const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4;
177   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
178
179   /* We use compr4 with a size != 16 elements. Strange, we conservatively
180    * consider that we are writing the register.
181    */
182   if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
183      return true;
184
185   /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
186   if (is_compr4) {
187      const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
188
189      /* First 8-way register */
190      const int write_start0 = mrf_index*REG_SIZE
191                             + inst->bits1.da1.dest_subreg_nr;
192      const int write_end0 = write_start0 + length;
193
194      /* Second 8-way register */
195      const int write_start1 = (mrf_index+4)*REG_SIZE
196                             + inst->bits1.da1.dest_subreg_nr;
197      const int write_end1 = write_start1 + length;
198
199      /* If the two intervals intersect, we overwrite the register */
200      const int left0 = MAX2(write_start0, reg_start);
201      const int right0 = MIN2(write_end0, reg_end);
202      const int left1 = MAX2(write_start1, reg_start);
203      const int right1 = MIN2(write_end1, reg_end);
204
205      if (left0 < right0 || left1 < right1)
206	 return true;
207   }
208   else {
209      int length;
210      length = 1 << inst->header.execution_size;
211      length *= type_size;
212      length *= inst->bits1.da1.dest_horiz_stride;
213
214      /* If the two intervals intersect, we write into the register */
215      const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
216                            + inst->bits1.da1.dest_subreg_nr;
217      const int write_end = write_start + length;
218      const int left = MAX2(write_start, reg_start);
219      const int right = MIN2(write_end, reg_end);
220
221      if (left < right)
222	 return true;
223   }
224
225   return false;
226}
227
228/* SEND may perform an implicit mov to a mrf register */
229static bool
230brw_is_mrf_written_send(const struct brw_instruction *inst,
231			int reg_index, int size)
232{
233
234   const int reg_start = reg_index * REG_SIZE;
235   const int reg_end = reg_start + size;
236   const int mrf_start = inst->header.destreg__conditionalmod;
237   const int write_start = mrf_start * REG_SIZE;
238   const int write_end = write_start + REG_SIZE;
239   const int left = MAX2(write_start, reg_start);
240   const int right = MIN2(write_end, reg_end);
241
242   if (inst->header.opcode != BRW_OPCODE_SEND ||
243       inst->bits1.da1.src0_reg_file == 0)
244      return false;
245
246   return left < right;
247}
248
249/* Specific path for message register since we need to handle the compr4 case */
250static INLINE bool
251brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
252{
253   return (brw_is_mrf_written_alu(inst, reg_index, size) ||
254	   brw_is_mrf_written_send(inst, reg_index, size));
255}
256
257static INLINE bool
258brw_is_mrf_read(const struct brw_instruction *inst,
259                int reg_index, int size, int gen)
260{
261   if (inst->header.opcode != BRW_OPCODE_SEND)
262      return false;
263   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
264      return true;
265
266   const int reg_start = reg_index*REG_SIZE;
267   const int reg_end = reg_start + size;
268
269   int length, read_start, read_end;
270   if (gen >= 5)
271      length = inst->bits3.generic_gen5.msg_length*REG_SIZE;
272   else
273      length = inst->bits3.generic.msg_length*REG_SIZE;
274
275   /* Look if SEND uses an implicit mov. In that case, we read one less register
276    * (but we write it)
277    */
278   if (inst->bits1.da1.src0_reg_file != 0)
279      read_start = inst->header.destreg__conditionalmod;
280   else {
281      length--;
282      read_start = inst->header.destreg__conditionalmod + 1;
283   }
284   read_start *= REG_SIZE;
285   read_end = read_start + length;
286
287   const int left = MAX2(read_start, reg_start);
288   const int right = MIN2(read_end, reg_end);
289
290   return left < right;
291}
292
293static INLINE bool
294brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
295{
296   int i, j;
297   if (brw_opcodes[inst->header.opcode].nsrc == 0)
298      return false;
299
300   /* Look at first source. We must take into account register regions to
301    * monitor carefully the read. Note that we are a bit too conservative here
302    * since we do not take into account the fact that some complete registers
303    * may be skipped
304    */
305   if (brw_opcodes[inst->header.opcode].nsrc >= 1) {
306
307      if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
308         if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
309            return true;
310      if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
311         return false;
312
313      const int reg_start = reg_index*REG_SIZE;
314      const int reg_end = reg_start + size;
315
316      /* See if at least one of this element intersects the interval */
317      const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
318      const int elem_num = 1 << inst->header.execution_size;
319      const int width = 1 << inst->bits2.da1.src0_width;
320      const int row_num = elem_num >> inst->bits2.da1.src0_width;
321      const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
322      const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
323      int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE
324                    + inst->bits2.da1.src0_subreg_nr;
325      for (j = 0; j < row_num; ++j) {
326         int write_start = row_start;
327         for (i = 0; i < width; ++i) {
328            const int write_end = write_start + type_size;
329            const int left = write_start > reg_start ? write_start : reg_start;
330            const int right = write_end < reg_end ? write_end : reg_end;
331            if (left < right)
332               return true;
333            write_start += hs;
334         }
335         row_start += vs;
336      }
337   }
338
339   /* Second src register */
340   if (brw_opcodes[inst->header.opcode].nsrc >= 2) {
341
342      if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
343         if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
344            return true;
345      if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
346         return false;
347
348      const int reg_start = reg_index*REG_SIZE;
349      const int reg_end = reg_start + size;
350
351      /* See if at least one of this element intersects the interval */
352      const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
353      const int elem_num = 1 << inst->header.execution_size;
354      const int width = 1 << inst->bits3.da1.src1_width;
355      const int row_num = elem_num >> inst->bits3.da1.src1_width;
356      const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
357      const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
358      int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE
359                    + inst->bits3.da1.src1_subreg_nr;
360      for (j = 0; j < row_num; ++j) {
361         int write_start = row_start;
362         for (i = 0; i < width; ++i) {
363            const int write_end = write_start + type_size;
364            const int left = write_start > reg_start ? write_start : reg_start;
365            const int right = write_end < reg_end ? write_end : reg_end;
366            if (left < right)
367               return true;
368            write_start += hs;
369         }
370         row_start += vs;
371      }
372   }
373
374   return false;
375}
376
377static INLINE bool
378brw_is_control_done(const struct brw_instruction *mov) {
379   return
380       mov->header.dependency_control != 0 ||
381       mov->header.thread_control != 0 ||
382       mov->header.mask_control != 0 ||
383       mov->header.saturate != 0 ||
384       mov->header.debug_control != 0;
385}
386
387static INLINE bool
388brw_is_predicated(const struct brw_instruction *mov) {
389   return mov->header.predicate_control != 0;
390}
391
392static INLINE bool
393brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
394                      int *mrf_index,
395                      int *grf_index,
396                      bool *is_compr4)
397{
398   if (brw_is_predicated(mov) ||
399       brw_is_control_done(mov) ||
400       mov->header.debug_control != 0)
401      return false;
402
403   if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
404       mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
405       mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F ||
406       mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
407       mov->bits1.da1.dest_subreg_nr != 0)
408      return false;
409
410   if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
411       mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
412       mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F ||
413       mov->bits2.da1.src0_width != BRW_WIDTH_8 ||
414       mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
415       mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 ||
416       mov->bits2.da1.src0_subreg_nr != 0 ||
417       mov->bits2.da1.src0_abs != 0 ||
418       mov->bits2.da1.src0_negate != 0)
419      return false;
420
421   *grf_index = mov->bits2.da1.src0_reg_nr;
422   *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
423   *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0;
424   return true;
425}
426
427static INLINE bool
428brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
429{
430   /* remark: no problem to predicate a SEL instruction */
431   if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
432       brw_is_control_done(inst) == false &&
433       inst->header.execution_size == 4 &&
434       inst->header.access_mode == BRW_ALIGN_1 &&
435       inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
436       inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
437       inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F &&
438       inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 &&
439       inst->bits1.da1.dest_reg_nr == grf_index &&
440       inst->bits1.da1.dest_subreg_nr == 0 &&
441       brw_is_arithmetic_inst(inst))
442      return true;
443
444   return false;
445}
446
447static INLINE bool
448brw_inst_are_equal(const struct brw_instruction *src0,
449                   const struct brw_instruction *src1)
450{
451   const GLuint *field0 = (GLuint *) src0;
452   const GLuint *field1 = (GLuint *) src1;
453   return field0[0] == field1[0] &&
454          field0[1] == field1[1] &&
455          field0[2] == field1[2] &&
456          field0[3] == field1[3];
457}
458
459static INLINE void
460brw_inst_copy(struct brw_instruction *dst,
461              const struct brw_instruction *src)
462{
463   GLuint *field_dst = (GLuint *) dst;
464   const GLuint *field_src = (GLuint *) src;
465   field_dst[0] = field_src[0];
466   field_dst[1] = field_src[1];
467   field_dst[2] = field_src[2];
468   field_dst[3] = field_src[3];
469}
470
471static void brw_remove_inst(struct brw_compile *p, const bool *removeInst)
472{
473   int i, nr_insn = 0, to = 0, from = 0;
474
475   for (from = 0; from < p->nr_insn; ++from) {
476      if (removeInst[from])
477         continue;
478      if(to != from)
479         brw_inst_copy(p->store + to, p->store + from);
480      to++;
481   }
482
483   for (i = 0; i < p->nr_insn; ++i)
484      if (removeInst[i] == false)
485         nr_insn++;
486   p->nr_insn = nr_insn;
487}
488
489/* The gen code emitter generates a lot of duplications in the
490 * grf-to-mrf moves, for example when texture sampling with the same
491 * coordinates from multiple textures..  Here, we monitor same mov
492 * grf-to-mrf instrutions and remove repeated ones where the operands
493 * and dst ahven't changed in between.
494 */
495void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
496{
497   const int gen = p->brw->intel.gen;
498   int i, j;
499
500   bool *removeInst = calloc(sizeof(bool), p->nr_insn);
501   for (i = 0; i < p->nr_insn; i++) {
502      if (removeInst[i])
503         continue;
504
505      const struct brw_instruction *mov = p->store + i;
506      int mrf_index, grf_index;
507      bool is_compr4;
508
509      /* Only consider _straight_ grf-to-mrf moves */
510      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
511         continue;
512
513      const int mrf_index0 = mrf_index;
514      const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
515      const int simd16_size = 2 * REG_SIZE;
516
517      for (j = i + 1; j < p->nr_insn; j++) {
518         const struct brw_instruction *inst = p->store + j;
519
520         if (brw_inst_are_equal(mov, inst)) {
521            removeInst[j] = true;
522            continue;
523         }
524
525         if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
526             brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
527             brw_is_mrf_written(inst, mrf_index1, REG_SIZE))
528            break;
529      }
530   }
531
532   brw_remove_inst(p, removeInst);
533   free(removeInst);
534}
535
536/* Replace moves to MRFs where the value moved is the result of a
537 * normal arithmetic operation with computation right into the MRF.
538 */
539void brw_remove_grf_to_mrf_moves(struct brw_compile *p)
540{
541   int i, j, prev;
542   struct brw_context *brw = p->brw;
543   const int gen = brw->intel.gen;
544   const int simd16_size = 2*REG_SIZE;
545
546   bool *removeInst = calloc(sizeof(bool), p->nr_insn);
547   assert(removeInst);
548
549   for (i = 0; i < p->nr_insn; i++) {
550      if (removeInst[i])
551         continue;
552
553      struct brw_instruction *grf_inst = NULL;
554      const struct brw_instruction *mov = p->store + i;
555      int mrf_index, grf_index;
556      bool is_compr4;
557
558      /* Only consider _straight_ grf-to-mrf moves */
559      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
560         continue;
561
562      /* Using comp4 enables a stride of 4 for this instruction */
563      const int mrf_index0 = mrf_index;
564      const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
565
566      /* Look where the register has been set */
567      prev = i;
568      bool potential_remove = false;
569      while (prev--) {
570
571         /* If _one_ instruction writes the grf, we try to remove the mov */
572         struct brw_instruction *inst = p->store + prev;
573         if (brw_is_grf_straight_write(inst, grf_index)) {
574            potential_remove = true;
575            grf_inst = inst;
576            break;
577         }
578
579      }
580
581      if (potential_remove == false)
582         continue;
583      removeInst[i] = true;
584
585      /* Monitor first the section of code between the grf computation and the
586       * mov. Here we cannot read or write both mrf and grf register
587       */
588      for (j = prev + 1; j < i; ++j) {
589         struct brw_instruction *inst = p->store + j;
590         if (removeInst[j])
591            continue;
592         if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
593             brw_is_grf_read(inst, grf_index, simd16_size)           ||
594             brw_is_mrf_written(inst, mrf_index0, REG_SIZE)   ||
595             brw_is_mrf_written(inst, mrf_index1, REG_SIZE)   ||
596             brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) ||
597             brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) {
598            removeInst[i] = false;
599            break;
600         }
601      }
602
603      /* After the mov, we can read or write the mrf. If the grf is overwritten,
604       * we are done
605       */
606      for (j = i + 1; j < p->nr_insn; ++j) {
607         struct brw_instruction *inst = p->store + j;
608         if (removeInst[j])
609            continue;
610
611         if (brw_is_grf_read(inst, grf_index, simd16_size)) {
612            removeInst[i] = false;
613            break;
614         }
615
616         if (brw_is_grf_straight_write(inst, grf_index))
617            break;
618      }
619
620      /* Note that with the top down traversal, we can safely pacth the mov
621       * instruction
622       */
623      if (removeInst[i]) {
624         grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
625         grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
626      }
627   }
628
629   brw_remove_inst(p, removeInst);
630   free(removeInst);
631}
632
633static bool
634is_single_channel_dp4(struct brw_instruction *insn)
635{
636   if (insn->header.opcode != BRW_OPCODE_DP4 ||
637       insn->header.execution_size != BRW_EXECUTE_8 ||
638       insn->header.access_mode != BRW_ALIGN_16 ||
639       insn->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
640      return false;
641
642   if (!is_power_of_two(insn->bits1.da16.dest_writemask))
643      return false;
644
645   return true;
646}
647
648/**
649 * Sets the dependency control fields on DP4 instructions.
650 *
651 * The hardware only tracks dependencies on a register basis, so when
652 * you do:
653 *
654 * DP4 dst.x src1 src2
655 * DP4 dst.y src1 src3
656 * DP4 dst.z src1 src4
657 * DP4 dst.w src1 src5
658 *
659 * It will wait to do the DP4 dst.y until the dst.x is resolved, etc.
660 * We can examine our instruction stream and set the dependency
661 * control fields to tell the hardware when to do it.
662 *
663 * We may want to extend this to other instructions that are used to
664 * fill in a channel at a time of the destination register.
665 */
666static void
667brw_set_dp4_dependency_control(struct brw_compile *p)
668{
669   int i;
670
671   for (i = 1; i < p->nr_insn; i++) {
672      struct brw_instruction *insn = &p->store[i];
673      struct brw_instruction *prev = &p->store[i - 1];
674
675      if (!is_single_channel_dp4(prev))
676	 continue;
677
678      if (!is_single_channel_dp4(insn)) {
679	 i++;
680	 continue;
681      }
682
683      /* Only avoid hw dep control if the write masks are different
684       * channels of one reg.
685       */
686      if (insn->bits1.da16.dest_writemask == prev->bits1.da16.dest_writemask)
687	 continue;
688      if (insn->bits1.da16.dest_reg_nr != prev->bits1.da16.dest_reg_nr)
689	 continue;
690
691      /* Check if the second instruction depends on the previous one
692       * for a src.
693       */
694      if (insn->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE &&
695	  (insn->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
696	   insn->bits2.da1.src0_reg_nr == insn->bits1.da16.dest_reg_nr))
697	  continue;
698      if (insn->bits1.da1.src1_reg_file == BRW_GENERAL_REGISTER_FILE &&
699	  (insn->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT ||
700	   insn->bits3.da1.src1_reg_nr == insn->bits1.da16.dest_reg_nr))
701	  continue;
702
703      prev->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
704      insn->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED;
705   }
706}
707
/* Entry point of the instruction-stream optimizer.  The only pass run here
 * is the DP4 dependency-control tagging.
 */
void brw_optimize(struct brw_compile *p)
{
   brw_set_dp4_dependency_control(p);
}
713