1 2/* 3 * Copyright © 2014 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25#include "vc4_qpu.h" 26 27static void 28fail_instr(uint64_t inst, const char *msg) 29{ 30 fprintf(stderr, "vc4_qpu_validate: %s: ", msg); 31 vc4_qpu_disasm(&inst, 1); 32 fprintf(stderr, "\n"); 33 abort(); 34} 35 36static bool 37writes_reg(uint64_t inst, uint32_t w) 38{ 39 return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w || 40 QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w); 41} 42 43static bool 44_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b) 45{ 46 struct { 47 uint32_t mux, addr; 48 } src_regs[] = { 49 { QPU_GET_FIELD(inst, QPU_ADD_A) }, 50 { QPU_GET_FIELD(inst, QPU_ADD_B) }, 51 { QPU_GET_FIELD(inst, QPU_MUL_A) }, 52 { QPU_GET_FIELD(inst, QPU_MUL_B) }, 53 }; 54 55 /* Branches only reference raddr_a (no mux), and we don't use that 56 * feature of branching. 57 */ 58 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) 59 return false; 60 61 /* Load immediates don't read any registers. */ 62 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM) 63 return false; 64 65 for (int i = 0; i < ARRAY_SIZE(src_regs); i++) { 66 if (!ignore_a && 67 src_regs[i].mux == QPU_MUX_A && 68 (QPU_GET_FIELD(inst, QPU_RADDR_A) == r)) 69 return true; 70 71 if (!ignore_b && 72 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM && 73 src_regs[i].mux == QPU_MUX_B && 74 (QPU_GET_FIELD(inst, QPU_RADDR_B) == r)) 75 return true; 76 } 77 78 return false; 79} 80 81static bool 82reads_reg(uint64_t inst, uint32_t r) 83{ 84 return _reads_reg(inst, r, false, false); 85} 86 87static bool 88reads_a_reg(uint64_t inst, uint32_t r) 89{ 90 return _reads_reg(inst, r, false, true); 91} 92 93static bool 94reads_b_reg(uint64_t inst, uint32_t r) 95{ 96 return _reads_reg(inst, r, true, false); 97} 98 99static bool 100writes_sfu(uint64_t inst) 101{ 102 return (writes_reg(inst, QPU_W_SFU_RECIP) || 103 writes_reg(inst, QPU_W_SFU_RECIPSQRT) || 104 writes_reg(inst, QPU_W_SFU_EXP) || 105 writes_reg(inst, QPU_W_SFU_LOG)); 106} 107 108/** 109 * Checks for the instruction restrictions from page 37 ("Summary of 110 * Instruction Restrictions"). 111 */ 112void 113vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) 114{ 115 bool scoreboard_locked = false; 116 bool threaded = false; 117 118 /* We don't want to do validation in release builds, but we want to 119 * keep compiling the validation code to make sure it doesn't get 120 * broken. 121 */ 122#ifndef DEBUG 123 return; 124#endif 125 126 for (int i = 0; i < num_inst; i++) { 127 uint64_t inst = insts[i]; 128 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 129 130 if (sig != QPU_SIG_PROG_END) { 131 if (qpu_inst_is_tlb(inst)) 132 scoreboard_locked = true; 133 134 if (sig == QPU_SIG_THREAD_SWITCH || 135 sig == QPU_SIG_LAST_THREAD_SWITCH) { 136 threaded = true; 137 } 138 139 continue; 140 } 141 142 /* "The Thread End instruction must not write to either physical 143 * regfile A or B." 144 */ 145 if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 || 146 QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) { 147 fail_instr(inst, "write to phys reg in thread end"); 148 } 149 150 /* Can't trigger an implicit wait on scoreboard in the program 151 * end instruction. 152 */ 153 if (qpu_inst_is_tlb(inst) && !scoreboard_locked) 154 fail_instr(inst, "implicit sb wait in program end"); 155 156 /* Two delay slots will be executed. */ 157 assert(i + 2 <= num_inst); 158 159 for (int j = i; j < i + 2; j++) { 160 /* "The last three instructions of any program 161 * (Thread End plus the following two delay-slot 162 * instructions) must not do varyings read, uniforms 163 * read or any kind of VPM, VDR, or VDW read or 164 * write." 165 */ 166 if (writes_reg(insts[j], QPU_W_VPM) || 167 reads_reg(insts[j], QPU_R_VARY) || 168 reads_reg(insts[j], QPU_R_UNIF) || 169 reads_reg(insts[j], QPU_R_VPM)) { 170 fail_instr(insts[j], "last 3 instructions " 171 "using fixed functions"); 172 } 173 174 /* "The Thread End instruction and the following two 175 * delay slot instructions must not write or read 176 * address 14 in either regfile A or B." 177 */ 178 if (writes_reg(insts[j], 14) || 179 reads_reg(insts[j], 14)) { 180 fail_instr(insts[j], "last 3 instructions " 181 "must not use r14"); 182 } 183 } 184 185 /* "The final program instruction (the second delay slot 186 * instruction) must not do a TLB Z write." 187 */ 188 if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) { 189 fail_instr(insts[i + 2], "final instruction doing " 190 "Z write"); 191 } 192 } 193 194 /* "A scoreboard wait must not occur in the first two instructions of 195 * a fragment shader. This is either the explicit Wait for Scoreboard 196 * signal or an implicit wait with the first tile-buffer read or 197 * write instruction." 198 */ 199 for (int i = 0; i < 2; i++) { 200 uint64_t inst = insts[i]; 201 202 if (qpu_inst_is_tlb(inst)) 203 fail_instr(inst, "sb wait in first two insts"); 204 } 205 206 /* "If TMU_NOSWAP is written, the write must be three instructions 207 * before the first TMU write instruction. For example, if 208 * TMU_NOSWAP is written in the first shader instruction, the first 209 * TMU write cannot occur before the 4th shader instruction." 210 */ 211 int last_tmu_noswap = -10; 212 for (int i = 0; i < num_inst; i++) { 213 uint64_t inst = insts[i]; 214 215 if ((i - last_tmu_noswap) <= 3 && 216 (writes_reg(inst, QPU_W_TMU0_S) || 217 writes_reg(inst, QPU_W_TMU1_S))) { 218 fail_instr(inst, "TMU write too soon after TMU_NOSWAP"); 219 } 220 221 if (writes_reg(inst, QPU_W_TMU_NOSWAP)) 222 last_tmu_noswap = i; 223 } 224 225 /* "An instruction must not read from a location in physical regfile A 226 * or B that was written to by the previous instruction." 227 */ 228 for (int i = 0; i < num_inst - 1; i++) { 229 uint64_t inst = insts[i]; 230 uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 231 uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 232 uint32_t waddr_a, waddr_b; 233 234 if (inst & QPU_WS) { 235 waddr_b = add_waddr; 236 waddr_a = mul_waddr; 237 } else { 238 waddr_a = add_waddr; 239 waddr_b = mul_waddr; 240 } 241 242 if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) || 243 (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) { 244 fail_instr(insts[i + 1], 245 "Reads physical reg too soon after write"); 246 } 247 } 248 249 /* "After an SFU lookup instruction, accumulator r4 must not be read 250 * in the following two instructions. Any other instruction that 251 * results in r4 being written (that is, TMU read, TLB read, SFU 252 * lookup) cannot occur in the two instructions following an SFU 253 * lookup." 254 */ 255 int last_sfu_inst = -10; 256 for (int i = 0; i < num_inst - 1; i++) { 257 uint64_t inst = insts[i]; 258 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 259 260 if (i - last_sfu_inst <= 2 && 261 (writes_sfu(inst) || 262 sig == QPU_SIG_LOAD_TMU0 || 263 sig == QPU_SIG_LOAD_TMU1 || 264 sig == QPU_SIG_COLOR_LOAD)) { 265 fail_instr(inst, "R4 write too soon after SFU write"); 266 } 267 268 if (writes_sfu(inst)) 269 last_sfu_inst = i; 270 } 271 272 for (int i = 0; i < num_inst - 1; i++) { 273 uint64_t inst = insts[i]; 274 275 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM && 276 QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= 277 QPU_SMALL_IMM_MUL_ROT) { 278 uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A); 279 uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B); 280 281 /* "The full horizontal vector rotate is only 282 * available when both of the mul ALU input arguments 283 * are taken from accumulators r0-r3." 284 */ 285 if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) { 286 fail_instr(inst, 287 "MUL rotate using non-accumulator " 288 "input"); 289 } 290 291 if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) == 292 QPU_SMALL_IMM_MUL_ROT) { 293 /* "An instruction that does a vector rotate 294 * by r5 must not immediately follow an 295 * instruction that writes to r5." 296 */ 297 if (writes_reg(insts[i - 1], QPU_W_ACC5)) { 298 fail_instr(inst, 299 "vector rotate by r5 " 300 "immediately after r5 write"); 301 } 302 } 303 304 /* "An instruction that does a vector rotate must not 305 * immediately follow an instruction that writes to the 306 * accumulator that is being rotated." 307 */ 308 if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) || 309 writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) { 310 fail_instr(inst, 311 "vector rotate of value " 312 "written in previous instruction"); 313 } 314 } 315 } 316 317 /* "An instruction that does a vector rotate must not immediately 318 * follow an instruction that writes to the accumulator that is being 319 * rotated. 320 * 321 * XXX: TODO. 322 */ 323 324 /* "After an instruction that does a TLB Z write, the multisample mask 325 * must not be read as an instruction input argument in the following 326 * two instruction. The TLB Z write instruction can, however, be 327 * followed immediately by a TLB color write." 328 */ 329 for (int i = 0; i < num_inst - 1; i++) { 330 uint64_t inst = insts[i]; 331 if (writes_reg(inst, QPU_W_TLB_Z) && 332 (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) || 333 reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) { 334 fail_instr(inst, "TLB Z write followed by MS mask read"); 335 } 336 } 337 338 /* 339 * "A single instruction can only perform a maximum of one of the 340 * following closely coupled peripheral accesses in a single 341 * instruction: TMU write, TMU read, TLB write, TLB read, TLB 342 * combined color read and write, SFU write, Mutex read or Semaphore 343 * access." 344 */ 345 for (int i = 0; i < num_inst - 1; i++) { 346 uint64_t inst = insts[i]; 347 348 if (qpu_num_sf_accesses(inst) > 1) 349 fail_instr(inst, "Single instruction writes SFU twice"); 350 } 351 352 /* "The uniform base pointer can be written (from SIMD element 0) by 353 * the processor to reset the stream, there must be at least two 354 * nonuniform-accessing instructions following a pointer change 355 * before uniforms can be accessed once more." 356 */ 357 int last_unif_pointer_update = -3; 358 for (int i = 0; i < num_inst; i++) { 359 uint64_t inst = insts[i]; 360 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 361 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 362 363 if (reads_reg(inst, QPU_R_UNIF) && 364 i - last_unif_pointer_update <= 2) { 365 fail_instr(inst, 366 "uniform read too soon after pointer update"); 367 } 368 369 if (waddr_add == QPU_W_UNIFORMS_ADDRESS || 370 waddr_mul == QPU_W_UNIFORMS_ADDRESS) 371 last_unif_pointer_update = i; 372 } 373 374 if (threaded) { 375 bool last_thrsw_found = false; 376 bool scoreboard_locked = false; 377 int tex_samples_outstanding = 0; 378 int last_tex_samples_outstanding = 0; 379 int thrsw_ip = -1; 380 381 for (int i = 0; i < num_inst; i++) { 382 uint64_t inst = insts[i]; 383 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); 384 385 if (i == thrsw_ip) { 386 /* In order to get texture results back in the 387 * correct order, before a new thrsw we have 388 * to read all the texture results from before 389 * the previous thrsw. 390 * 391 * FIXME: Is collecting the remaining results 392 * during the delay slots OK, or should we do 393 * this at THRSW signal time? 394 */ 395 if (last_tex_samples_outstanding != 0) { 396 fail_instr(inst, "THRSW with texture " 397 "results from the previous " 398 "THRSW still in the FIFO."); 399 } 400 401 last_tex_samples_outstanding = 402 tex_samples_outstanding; 403 tex_samples_outstanding = 0; 404 } 405 406 if (qpu_inst_is_tlb(inst)) 407 scoreboard_locked = true; 408 409 switch (sig) { 410 case QPU_SIG_THREAD_SWITCH: 411 case QPU_SIG_LAST_THREAD_SWITCH: 412 /* No thread switching with the scoreboard 413 * locked. Doing so means we may deadlock 414 * when the other thread tries to lock 415 * scoreboard. 416 */ 417 if (scoreboard_locked) { 418 fail_instr(inst, "THRSW with the " 419 "scoreboard locked."); 420 } 421 422 /* No thread switching after lthrsw, since 423 * lthrsw means that we get delayed until the 424 * other shader is ready for us to terminate. 425 */ 426 if (last_thrsw_found) { 427 fail_instr(inst, "THRSW after a " 428 "previous LTHRSW"); 429 } 430 431 if (sig == QPU_SIG_LAST_THREAD_SWITCH) 432 last_thrsw_found = true; 433 434 /* No THRSW while we already have a THRSW 435 * queued. 436 */ 437 if (i < thrsw_ip) { 438 fail_instr(inst, 439 "THRSW with a THRSW queued."); 440 } 441 442 thrsw_ip = i + 3; 443 break; 444 445 case QPU_SIG_LOAD_TMU0: 446 case QPU_SIG_LOAD_TMU1: 447 if (last_tex_samples_outstanding == 0) { 448 fail_instr(inst, "TMU load with nothing " 449 "in the results fifo from " 450 "the previous THRSW."); 451 } 452 453 last_tex_samples_outstanding--; 454 break; 455 } 456 457 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); 458 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); 459 if (waddr_add == QPU_W_TMU0_S || 460 waddr_add == QPU_W_TMU1_S || 461 waddr_mul == QPU_W_TMU0_S || 462 waddr_mul == QPU_W_TMU1_S) { 463 tex_samples_outstanding++; 464 } 465 } 466 } 467} 468