1
2/*
3 * Copyright © 2014 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25#include "vc4_qpu.h"
26
/* Reports a validation failure: prints the reason and a disassembly of the
 * offending instruction to stderr, then aborts the process.  Validation
 * errors indicate compiler bugs, so there is no recovery path.
 */
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        /* Disassemble just this single instruction for context. */
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}
35
36static bool
37writes_reg(uint64_t inst, uint32_t w)
38{
39        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
40                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
41}
42
43static bool
44_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
45{
46        struct {
47                uint32_t mux, addr;
48        } src_regs[] = {
49                { QPU_GET_FIELD(inst, QPU_ADD_A) },
50                { QPU_GET_FIELD(inst, QPU_ADD_B) },
51                { QPU_GET_FIELD(inst, QPU_MUL_A) },
52                { QPU_GET_FIELD(inst, QPU_MUL_B) },
53        };
54
55        /* Branches only reference raddr_a (no mux), and we don't use that
56         * feature of branching.
57         */
58        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
59                return false;
60
61        /* Load immediates don't read any registers. */
62        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
63                return false;
64
65        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
66                if (!ignore_a &&
67                    src_regs[i].mux == QPU_MUX_A &&
68                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
69                        return true;
70
71                if (!ignore_b &&
72                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
73                    src_regs[i].mux == QPU_MUX_B &&
74                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
75                        return true;
76        }
77
78        return false;
79}
80
/* Returns true if the instruction reads raddr @r on either regfile. */
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}
86
/* Returns true if the instruction reads raddr @r through regfile A only. */
static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}
92
/* Returns true if the instruction reads raddr @r through regfile B only. */
static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}
98
99static bool
100writes_sfu(uint64_t inst)
101{
102        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
103                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
104                writes_reg(inst, QPU_W_SFU_EXP) ||
105                writes_reg(inst, QPU_W_SFU_LOG));
106}
107
108/**
109 * Checks for the instruction restrictions from page 37 ("Summary of
110 * Instruction Restrictions").
111 */
112void
113vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
114{
115        bool scoreboard_locked = false;
116        bool threaded = false;
117
118        /* We don't want to do validation in release builds, but we want to
119         * keep compiling the validation code to make sure it doesn't get
120         * broken.
121         */
122#ifndef DEBUG
123        return;
124#endif
125
126        for (int i = 0; i < num_inst; i++) {
127                uint64_t inst = insts[i];
128                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
129
130                if (sig != QPU_SIG_PROG_END) {
131                        if (qpu_inst_is_tlb(inst))
132                                scoreboard_locked = true;
133
134                        if (sig == QPU_SIG_THREAD_SWITCH ||
135                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
136                                threaded = true;
137                        }
138
139                        continue;
140                }
141
142                /* "The Thread End instruction must not write to either physical
143                 *  regfile A or B."
144                 */
145                if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
146                    QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
147                        fail_instr(inst, "write to phys reg in thread end");
148                }
149
150                /* Can't trigger an implicit wait on scoreboard in the program
151                 * end instruction.
152                 */
153                if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
154                        fail_instr(inst, "implicit sb wait in program end");
155
156                /* Two delay slots will be executed. */
157                assert(i + 2 <= num_inst);
158
159                 for (int j = i; j < i + 2; j++) {
160                         /* "The last three instructions of any program
161                          *  (Thread End plus the following two delay-slot
162                          *  instructions) must not do varyings read, uniforms
163                          *  read or any kind of VPM, VDR, or VDW read or
164                          *  write."
165                          */
166                         if (writes_reg(insts[j], QPU_W_VPM) ||
167                             reads_reg(insts[j], QPU_R_VARY) ||
168                             reads_reg(insts[j], QPU_R_UNIF) ||
169                             reads_reg(insts[j], QPU_R_VPM)) {
170                                 fail_instr(insts[j], "last 3 instructions "
171                                            "using fixed functions");
172                         }
173
174                         /* "The Thread End instruction and the following two
175                          *  delay slot instructions must not write or read
176                          *  address 14 in either regfile A or B."
177                          */
178                         if (writes_reg(insts[j], 14) ||
179                             reads_reg(insts[j], 14)) {
180                                 fail_instr(insts[j], "last 3 instructions "
181                                            "must not use r14");
182                         }
183                 }
184
185                 /* "The final program instruction (the second delay slot
186                  *  instruction) must not do a TLB Z write."
187                  */
188                 if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
189                         fail_instr(insts[i + 2], "final instruction doing "
190                                    "Z write");
191                 }
192        }
193
194        /* "A scoreboard wait must not occur in the first two instructions of
195         *  a fragment shader. This is either the explicit Wait for Scoreboard
196         *  signal or an implicit wait with the first tile-buffer read or
197         *  write instruction."
198         */
199        for (int i = 0; i < 2; i++) {
200                uint64_t inst = insts[i];
201
202                if (qpu_inst_is_tlb(inst))
203                        fail_instr(inst, "sb wait in first two insts");
204        }
205
206        /* "If TMU_NOSWAP is written, the write must be three instructions
207         *  before the first TMU write instruction.  For example, if
208         *  TMU_NOSWAP is written in the first shader instruction, the first
209         *  TMU write cannot occur before the 4th shader instruction."
210         */
211        int last_tmu_noswap = -10;
212        for (int i = 0; i < num_inst; i++) {
213                uint64_t inst = insts[i];
214
215                if ((i - last_tmu_noswap) <= 3 &&
216                    (writes_reg(inst, QPU_W_TMU0_S) ||
217                     writes_reg(inst, QPU_W_TMU1_S))) {
218                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
219                }
220
221                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
222                    last_tmu_noswap = i;
223        }
224
225        /* "An instruction must not read from a location in physical regfile A
226         *  or B that was written to by the previous instruction."
227         */
228        for (int i = 0; i < num_inst - 1; i++) {
229                uint64_t inst = insts[i];
230                uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
231                uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
232                uint32_t waddr_a, waddr_b;
233
234                if (inst & QPU_WS) {
235                        waddr_b = add_waddr;
236                        waddr_a = mul_waddr;
237                } else {
238                        waddr_a = add_waddr;
239                        waddr_b = mul_waddr;
240                }
241
242                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
243                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
244                        fail_instr(insts[i + 1],
245                                   "Reads physical reg too soon after write");
246                }
247        }
248
249        /* "After an SFU lookup instruction, accumulator r4 must not be read
250         *  in the following two instructions. Any other instruction that
251         *  results in r4 being written (that is, TMU read, TLB read, SFU
252         *  lookup) cannot occur in the two instructions following an SFU
253         *  lookup."
254         */
255        int last_sfu_inst = -10;
256        for (int i = 0; i < num_inst - 1; i++) {
257                uint64_t inst = insts[i];
258                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
259
260                if (i - last_sfu_inst <= 2 &&
261                    (writes_sfu(inst) ||
262                     sig == QPU_SIG_LOAD_TMU0 ||
263                     sig == QPU_SIG_LOAD_TMU1 ||
264                     sig == QPU_SIG_COLOR_LOAD)) {
265                        fail_instr(inst, "R4 write too soon after SFU write");
266                }
267
268                if (writes_sfu(inst))
269                        last_sfu_inst = i;
270        }
271
272        for (int i = 0; i < num_inst - 1; i++) {
273                uint64_t inst = insts[i];
274
275                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
276                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
277                    QPU_SMALL_IMM_MUL_ROT) {
278                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
279                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
280
281                        /* "The full horizontal vector rotate is only
282                         *  available when both of the mul ALU input arguments
283                         *  are taken from accumulators r0-r3."
284                         */
285                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
286                                fail_instr(inst,
287                                           "MUL rotate using non-accumulator "
288                                           "input");
289                        }
290
291                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
292                            QPU_SMALL_IMM_MUL_ROT) {
293                                /* "An instruction that does a vector rotate
294                                 *  by r5 must not immediately follow an
295                                 *  instruction that writes to r5."
296                                 */
297                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
298                                        fail_instr(inst,
299                                                   "vector rotate by r5 "
300                                                   "immediately after r5 write");
301                                }
302                        }
303
304                        /* "An instruction that does a vector rotate must not
305                         *  immediately follow an instruction that writes to the
306                         *  accumulator that is being rotated."
307                         */
308                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
309                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
310                                fail_instr(inst,
311                                           "vector rotate of value "
312                                           "written in previous instruction");
313                        }
314                }
315        }
316
317        /* "An instruction that does a vector rotate must not immediately
318         *  follow an instruction that writes to the accumulator that is being
319         *  rotated.
320         *
321         * XXX: TODO.
322         */
323
324        /* "After an instruction that does a TLB Z write, the multisample mask
325         *  must not be read as an instruction input argument in the following
326         *  two instruction. The TLB Z write instruction can, however, be
327         *  followed immediately by a TLB color write."
328         */
329        for (int i = 0; i < num_inst - 1; i++) {
330                uint64_t inst = insts[i];
331                if (writes_reg(inst, QPU_W_TLB_Z) &&
332                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
333                     reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
334                        fail_instr(inst, "TLB Z write followed by MS mask read");
335                }
336        }
337
338        /*
339         * "A single instruction can only perform a maximum of one of the
340         *  following closely coupled peripheral accesses in a single
341         *  instruction: TMU write, TMU read, TLB write, TLB read, TLB
342         *  combined color read and write, SFU write, Mutex read or Semaphore
343         *  access."
344         */
345        for (int i = 0; i < num_inst - 1; i++) {
346                uint64_t inst = insts[i];
347
348                if (qpu_num_sf_accesses(inst) > 1)
349                        fail_instr(inst, "Single instruction writes SFU twice");
350        }
351
352        /* "The uniform base pointer can be written (from SIMD element 0) by
353         *  the processor to reset the stream, there must be at least two
354         *  nonuniform-accessing instructions following a pointer change
355         *  before uniforms can be accessed once more."
356         */
357        int last_unif_pointer_update = -3;
358        for (int i = 0; i < num_inst; i++) {
359                uint64_t inst = insts[i];
360                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
361                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
362
363                if (reads_reg(inst, QPU_R_UNIF) &&
364                    i - last_unif_pointer_update <= 2) {
365                        fail_instr(inst,
366                                   "uniform read too soon after pointer update");
367                }
368
369                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
370                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
371                        last_unif_pointer_update = i;
372        }
373
374        if (threaded) {
375                bool last_thrsw_found = false;
376                bool scoreboard_locked = false;
377                int tex_samples_outstanding = 0;
378                int last_tex_samples_outstanding = 0;
379                int thrsw_ip = -1;
380
381                for (int i = 0; i < num_inst; i++) {
382                        uint64_t inst = insts[i];
383                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
384
385                        if (i == thrsw_ip) {
386                                /* In order to get texture results back in the
387                                 * correct order, before a new thrsw we have
388                                 * to read all the texture results from before
389                                 * the previous thrsw.
390                                 *
391                                 * FIXME: Is collecting the remaining results
392                                 * during the delay slots OK, or should we do
393                                 * this at THRSW signal time?
394                                 */
395                                if (last_tex_samples_outstanding != 0) {
396                                        fail_instr(inst, "THRSW with texture "
397                                                   "results from the previous "
398                                                   "THRSW still in the FIFO.");
399                                }
400
401                                last_tex_samples_outstanding =
402                                        tex_samples_outstanding;
403                                tex_samples_outstanding = 0;
404                        }
405
406                        if (qpu_inst_is_tlb(inst))
407                                scoreboard_locked = true;
408
409                        switch (sig) {
410                        case QPU_SIG_THREAD_SWITCH:
411                        case QPU_SIG_LAST_THREAD_SWITCH:
412                                /* No thread switching with the scoreboard
413                                 * locked.  Doing so means we may deadlock
414                                 * when the other thread tries to lock
415                                 * scoreboard.
416                                 */
417                                if (scoreboard_locked) {
418                                        fail_instr(inst, "THRSW with the "
419                                                   "scoreboard locked.");
420                                }
421
422                                /* No thread switching after lthrsw, since
423                                 * lthrsw means that we get delayed until the
424                                 * other shader is ready for us to terminate.
425                                 */
426                                if (last_thrsw_found) {
427                                        fail_instr(inst, "THRSW after a "
428                                                   "previous LTHRSW");
429                                }
430
431                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
432                                        last_thrsw_found = true;
433
434                                /* No THRSW while we already have a THRSW
435                                 * queued.
436                                 */
437                                if (i < thrsw_ip) {
438                                        fail_instr(inst,
439                                                   "THRSW with a THRSW queued.");
440                                }
441
442                                thrsw_ip = i + 3;
443                                break;
444
445                        case QPU_SIG_LOAD_TMU0:
446                        case QPU_SIG_LOAD_TMU1:
447                                if (last_tex_samples_outstanding == 0) {
448                                        fail_instr(inst, "TMU load with nothing "
449                                                   "in the results fifo from "
450                                                   "the previous THRSW.");
451                                }
452
453                                last_tex_samples_outstanding--;
454                                break;
455                        }
456
457                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
458                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
459                        if (waddr_add == QPU_W_TMU0_S ||
460                            waddr_add == QPU_W_TMU1_S ||
461                            waddr_mul == QPU_W_TMU0_S ||
462                            waddr_mul == QPU_W_TMU1_S) {
463                                tex_samples_outstanding++;
464                        }
465                }
466        }
467}
468