1.section #gk104_builtin_code
2// DIV U32: unsigned 32-bit division producing both quotient and modulus
3//
4// UNR recurrence (q = a / b):
5// look for z such that 2^32 - b <= b * z < 2^32
6// then q - 1 <= (a * z) / 2^32 <= q
7//
8// INPUT:   $r0: dividend, $r1: divisor
9// OUTPUT:  $r0: result, $r1: modulus
10// CLOBBER: $r2 - $r3, $p0 - $p1 (NOTE(review): the code below only writes $p0)
11// SIZE:    22 / 14 * 8 bytes
12//
13gk104_div_u32:
14   sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28
15   bfind u32 $r2 $r1 // $r2 = bit position found in the divisor b (presumably its highest set bit)
16   long xor b32 $r2 $r2 0x1f // $r2 = 31 - position
17   long mov b32 $r3 0x1
18   shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2: initial estimate for the recurrence
19   long cvt u32 $r1 neg u32 $r1 // $r1 = -b, kept negated for the refinement steps
20   long mul $r3 u32 $r1 u32 $r2 // $r3 = low 32 bits of (-b * z)
21   add $r2 (mul high u32 $r2 u32 $r3) $r2 // z += high32(z * $r3): refinement step 1
22   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
23   mul $r3 u32 $r1 u32 $r2
24   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 2
25   mul $r3 u32 $r1 u32 $r2
26   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 3
27   mul $r3 u32 $r1 u32 $r2
28   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 4
29   mul $r3 u32 $r1 u32 $r2
30   sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
31   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 5 (last)
32   mov b32 $r3 $r0 // keep the dividend a in $r3
33   mul high $r0 u32 $r0 u32 $r2 // q = high32(a * z)
34   long cvt u32 $r2 neg u32 $r1 // $r2 = b again (negation of -b)
35   long add $r1 (mul u32 $r1 u32 $r0) $r3 // modulus = a + (-b) * q
36   set $p0 0x1 ge u32 $r1 $r2 // $p0 = (modulus >= b): q is still too small
37   $p0 sub b32 $r1 $r1 $r2 // first correction: modulus -= b ...
38   sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
39   $p0 add b32 $r0 $r0 0x1 // ... and q += 1
40   $p0 set $p0 0x1 ge u32 $r1 $r2 // re-tested only when the first correction was applied
41   $p0 sub b32 $r1 $r1 $r2 // second correction: modulus -= b ...
42   $p0 add b32 $r0 $r0 0x1 // ... and q += 1
43   long ret
44
45// DIV S32, like DIV U32 after taking ABS(inputs); signs are re-applied at the end
46//
47// INPUT:   $r0: dividend, $r1: divisor
48// OUTPUT:  $r0: result, $r1: modulus
49// CLOBBER: $r2 - $r3, $p0 - $p3
50//
51gk104_div_s32:
52   set $p2 0x1 lt s32 $r0 0x0 // $p2 = dividend is negative
53   set $p3 0x1 lt s32 $r1 0x0 xor $p2 // $p3 = (divisor is negative) xor $p2, i.e. operand signs differ
54   sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28
55   long cvt s32 $r0 abs s32 $r0 // a = |dividend|
56   long cvt s32 $r1 abs s32 $r1 // b = |divisor|
57   bfind u32 $r2 $r1 // $r2 = bit position found in b (presumably its highest set bit)
58   long xor b32 $r2 $r2 0x1f // $r2 = 31 - position
59   long mov b32 $r3 0x1
60   shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2: initial estimate
61   cvt u32 $r1 neg u32 $r1 // $r1 = -b
62   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
63   mul $r3 u32 $r1 u32 $r2 // $r3 = low 32 bits of (-b * z)
64   add $r2 (mul high u32 $r2 u32 $r3) $r2 // z += high32(z * $r3): refinement step 1
65   mul $r3 u32 $r1 u32 $r2
66   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 2
67   mul $r3 u32 $r1 u32 $r2
68   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 3
69   mul $r3 u32 $r1 u32 $r2
70   sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
71   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 4
72   mul $r3 u32 $r1 u32 $r2
73   add $r2 (mul high u32 $r2 u32 $r3) $r2 // refinement step 5 (last)
74   mov b32 $r3 $r0 // keep a in $r3
75   mul high $r0 u32 $r0 u32 $r2 // q = high32(a * z)
76   long cvt u32 $r2 neg u32 $r1 // $r2 = b again
77   long add $r1 (mul u32 $r1 u32 $r0) $r3 // modulus = a + (-b) * q
78   sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
79   set $p0 0x1 ge u32 $r1 $r2 // $p0 = (modulus >= b)
80   $p0 sub b32 $r1 $r1 $r2 // first correction: modulus -= b ...
81   $p0 add b32 $r0 $r0 0x1 // ... and q += 1
82   $p0 set $p0 0x1 ge u32 $r1 $r2 // re-tested only when the first correction was applied
83   $p0 sub b32 $r1 $r1 $r2 // second correction
84   long $p0 add b32 $r0 $r0 0x1
85   long $p3 cvt s32 $r0 neg s32 $r0 // negate the quotient when the operand signs differed
86   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
87   $p2 cvt s32 $r1 neg s32 $r1 // negate the modulus when the dividend was negative
88   long ret
89
90// SULDP [for each format]: one load-and-convert handler per surface format, each ending in ret
91// $r4d: address
92// $r2: surface info (format)
93// $p0: access predicate
94// $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg)
95//
96// RGBA32: 128-bit load into $r0q, returned without conversion
97$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0 // ca variant, executed only when $p1 is set
98set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case, false for 01 / 10
99$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0 // cg variant, executed only when $p2 is set
100$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0 // cv variant, executed under the recomputed $p1
101long ret
102// RGBA16_UNORM: four unsigned 16-bit channels, converted to f32 and scaled
103sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
104$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // texel is 4 x 16 bit = 64 bits: load $r0d only, as the other RGBA16 handlers do
105set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
106$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
107$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
108cvt rn f32 $r3 u16 1 $r1 // $r3 = (float) 16-bit half 1 of $r1
109cvt rn f32 $r2 u16 0 $r1 // $r2 = (float) 16-bit half 0 of $r1
110mul f32 $r3 $r3 0x37800074 // scale; the constant is an f32 bit pattern just above 2^-16 (presumably ~1/65535)
111sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
112cvt rn f32 $r1 u16 1 $r0 // $r1 = (float) 16-bit half 1 of $r0
113mul f32 $r2 $r2 0x37800074
114cvt rn f32 $r0 u16 0 $r0 // $r0 = (float) 16-bit half 0 of $r0 (converted last: $r0 is also the source)
115mul f32 $r1 $r1 0x37800074
116mul f32 $r0 $r0 0x37800074
117long ret
118// RGBA16_SNORM: four signed 16-bit channels, converted to f32 and scaled
119$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 // 64-bit load into $r0d; ca variant when $p1 is set
120sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
121set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
122$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
123$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
124cvt rn f32 $r3 s16 1 $r1 // $r3 = (float) signed 16-bit half 1 of $r1
125cvt rn f32 $r2 s16 0 $r1 // $r2 = (float) signed 16-bit half 0 of $r1
126mul f32 $r3 $r3 0x38000187 // scale; the constant is an f32 bit pattern just above 2^-15 (presumably ~1/32767)
127cvt rn f32 $r1 s16 1 $r0
128sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
129mul f32 $r2 $r2 0x38000187
130cvt rn f32 $r0 s16 0 $r0 // converted last: $r0 is also the source
131mul f32 $r1 $r1 0x38000187
132mul f32 $r0 $r0 0x38000187
133long ret
134// RGBA16_SINT: four signed 16-bit channels, sign-extended to s32
135$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
136set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
137sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
138$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
139$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
140cvt s32 $r3 s16 1 $r1 // unpack the two halves of $r1 into $r3 / $r2
141cvt s32 $r2 s16 0 $r1
142cvt s32 $r1 s16 1 $r0 // unpack the two halves of $r0 into $r1 / $r0
143cvt s32 $r0 s16 0 $r0
144long ret
145// RGBA16_UINT: four unsigned 16-bit channels, zero-extended to u32
146sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
147$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
148set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
149$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
150$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
151cvt u32 $r3 u16 1 $r1 // unpack the two halves of $r1 into $r3 / $r2
152cvt u32 $r2 u16 0 $r1
153cvt u32 $r1 u16 1 $r0 // unpack the two halves of $r0 into $r1 / $r0
154sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
155cvt u32 $r0 u16 0 $r0
156long ret
157// RGBA16_FLOAT: four f16 channels, widened to f32
158$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
159set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
160$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
161$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
162cvt f32 $r3 f16 $r1 1 // $r3 = f16 half 1 of $r1 as f32
163sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
164cvt f32 $r2 f16 $r1 0
165cvt f32 $r1 f16 $r0 1
166cvt f32 $r0 f16 $r0 0 // converted last: $r0 is also the source
167long ret
168// RG32_FLOAT: 64-bit load into $r0d; $r2 / $r3 are filled with 0.0 / 1.0
169$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
170set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
171$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
172sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
173$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
174long mov b32 $r2 0x00000000 // $r2 = 0
175long mov b32 $r3 0x3f800000 // $r3 = 1.0f
176long ret
177// RG32_xINT: 64-bit load into $r0d; $r2 / $r3 are filled with integer 0 / 1
178$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
179set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
180$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
181sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
182$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
183long mov b32 $r2 0x00000000 // $r2 = 0
184long mov b32 $r3 0x00000001 // $r3 = integer 1
185long ret
186// RGB10A2_UNORM: three 10-bit fields unpacked, converted to f32 and scaled; $r3 is set to constant 1.0
187$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
188set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
189$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
190sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
191$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
192ext u32 $r1 $r0 0x0a0a // bitfield extract; presumably width 10 at bit offset 10
193long mov b32 $r3 0x3f800000 // $r3 = 1.0f (the top 2 bits of the texel are not read)
194ext u32 $r2 $r0 0x0a14 // presumably width 10 at bit offset 20
195long and b32 $r0 $r0 0x3ff // low 10 bits
196cvt rn f32 $r2 u16 0 $r2
197cvt rn f32 $r1 u16 0 $r1
198sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
199mul f32 $r2 $r2 0x3a802007 // scale; f32 bit pattern of ~1/1023
200cvt rn f32 $r0 u16 0 $r0
201mul f32 $r1 $r1 0x3a802007
202mul f32 $r0 $r0 0x3a802007
203long ret
204// RGB10A2_UINT: three 10-bit fields unpacked as integers; $r3 is set to constant 1
205$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
206set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
207sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
208$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
209$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
210ext u32 $r1 $r0 0x0a0a // presumably width 10 at bit offset 10
211long mov b32 $r3 0x00000001 // $r3 = integer 1 (the top 2 bits of the texel are not read)
212ext u32 $r2 $r0 0x0a14 // presumably width 10 at bit offset 20
213long and b32 $r0 $r0 0x3ff // low 10 bits
214long ret
215// RGBA8_UNORM: four unsigned bytes, converted to f32 and scaled
216sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
217$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
218set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
219$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
220$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
221cvt rn f32 $r3 u8 3 $r0 // $r3 = (float) byte 3 of $r0
222cvt rn f32 $r2 u8 2 $r0 // $r2 = (float) byte 2 of $r0
223mul f32 $r3 $r3 0x3b808081 // scale; f32 bit pattern of ~1/255
224sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
225cvt rn f32 $r1 u8 1 $r0 // $r1 = (float) byte 1 of $r0
226mul f32 $r2 $r2 0x3b808081
227cvt rn f32 $r0 u8 0 $r0 // converted last: $r0 is also the source
228mul f32 $r1 $r1 0x3b808081
229mul f32 $r0 $r0 0x3b808081
230long ret
231// RGBA8_SNORM: four signed bytes, converted to f32 and scaled
232$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
233sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
234set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
235$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
236$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
237cvt rn f32 $r3 s8 3 $r0
238cvt rn f32 $r2 s8 2 $r0
239mul f32 $r3 $r3 0x3c010204 // scale; f32 bit pattern of ~1/127
240cvt rn f32 $r1 s8 1 $r0
241sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
242mul f32 $r2 $r2 0x3c010204
243cvt rn f32 $r0 s8 0 $r0 // converted last: $r0 is also the source
244mul f32 $r1 $r1 0x3c010204
245mul f32 $r0 $r0 0x3c010204
246long ret
247// RGBA8_SINT: four signed bytes, sign-extended to s32
248$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
249set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
250sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
251$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
252$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
253cvt s32 $r3 s8 3 $r0
254cvt s32 $r2 s8 2 $r0
255cvt s32 $r1 s8 1 $r0
256cvt s32 $r0 s8 0 $r0 // converted last: $r0 is also the source
257long ret
258// RGBA8_UINT: four unsigned bytes, zero-extended to u32
259sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
260$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
261set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
262$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
263$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
264cvt u32 $r3 u8 3 $r0
265cvt u32 $r2 u8 2 $r0
266cvt u32 $r1 u8 1 $r0
267sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
268cvt u32 $r0 u8 0 $r0 // converted last: $r0 is also the source
269long ret
270// R5G6B5_UNORM: 5/6/5-bit fields unpacked, converted to f32 and scaled; $r3 is set to 1.0
271$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
272set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
273$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
274$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
275ext u32 $r1 $r0 0x0605 // presumably width 6 at bit offset 5
276sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
277long mov b32 $r3 0x3f800000 // $r3 = 1.0f
278ext u32 $r2 $r0 0x050b // presumably width 5 at bit offset 11
279long and b32 $r0 $r0 0x1f // low 5 bits
280cvt rn f32 $r2 u8 0 $r2
281cvt rn f32 $r1 u8 0 $r1
282mul f32 $r2 $r2 0x3d042108 // scale 5-bit field; f32 bit pattern of ~1/31
283cvt rn f32 $r0 u8 0 $r0
284sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
285mul f32 $r1 $r1 0x3c820821 // scale 6-bit field; f32 bit pattern of ~1/63
286mul f32 $r0 $r0 0x3d042108 // scale 5-bit field; ~1/31
287long ret
288// R5G5B5X1_UNORM: three 5-bit fields unpacked, converted to f32 and scaled; $r3 is set to 1.0
289$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
290set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
291$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
292$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
293sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
294ext u32 $r1 $r0 0x0505 // presumably width 5 at bit offset 5
295ext u32 $r2 $r0 0x050a // presumably width 5 at bit offset 10
296long and b32 $r0 $r0 0x1f // low 5 bits
297long mov b32 $r3 0x3f800000 // $r3 = 1.0f
298cvt rn f32 $r2 u8 0 $r2
299cvt rn f32 $r1 u8 0 $r1
300cvt rn f32 $r0 u8 0 $r0
301sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
302mul f32 $r2 $r2 0x3d042108 // scale; f32 bit pattern of ~1/31
303mul f32 $r1 $r1 0x3d042108
304mul f32 $r0 $r0 0x3d042108
305long ret
306// RG16_UNORM: two unsigned 16-bit channels converted to f32 and scaled; $r2 / $r3 = 0.0 / 1.0
307$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
308set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
309$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
310sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
311$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
312cvt rn f32 $r1 u16 1 $r0 // $r1 = (float) 16-bit half 1 of $r0
313cvt rn f32 $r0 u16 0 $r0 // $r0 = (float) 16-bit half 0 of $r0
314mul f32 $r1 $r1 0x37800074 // scale; f32 bit pattern just above 2^-16 (presumably ~1/65535)
315mul f32 $r0 $r0 0x37800074
316long mov b32 $r2 0x00000000 // $r2 = 0
317long mov b32 $r3 0x3f800000 // $r3 = 1.0f
318sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
319long ret
320// RG16_SNORM: two signed 16-bit channels converted to f32 and scaled; $r2 / $r3 = 0.0 / 1.0
321$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
322set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
323$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
324$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
325mov b32 $r3 0x3f800000 // $r3 = 1.0f
326cvt rn f32 $r1 s16 1 $r0
327sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
328mov b32 $r2 0x00000000 // $r2 = 0
329cvt rn f32 $r0 s16 0 $r0
330mul f32 $r1 $r1 0x38000187 // scale; f32 bit pattern just above 2^-15 (presumably ~1/32767)
331mul f32 $r0 $r0 0x38000187
332long ret
333// RG16_SINT: two signed 16-bit channels sign-extended to s32; $r2 / $r3 = 0 / 1
334$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
335set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
336sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
337$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
338$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
339mov b32 $r3 0x00000001 // $r3 = integer 1
340cvt s32 $r1 s16 1 $r0
341mov b32 $r2 0x00000000 // $r2 = 0
342cvt s32 $r0 s16 0 $r0
343long ret
344// RG16_UINT: two unsigned 16-bit channels zero-extended to u32; $r2 / $r3 = 0 / 1
345sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
346$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
347set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
348$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
349$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
350mov b32 $r3 0x00000001 // $r3 = integer 1
351cvt u32 $r1 u16 1 $r0
352mov b32 $r2 0x00000000 // $r2 = 0
353sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
354cvt u32 $r0 u16 0 $r0
355long ret
356// RG16_FLOAT: two f16 channels widened to f32; $r2 / $r3 = 0.0 / 1.0
357$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
358set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
359$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
360$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
361mov b32 $r3 0x3f800000 // $r3 = 1.0f
362sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
363cvt f32 $r1 f16 $r0 1
364mov b32 $r2 0x00000000 // $r2 = 0
365cvt f32 $r0 f16 $r0 0
366long ret
367// R32_FLOAT: 32-bit load into $r0; $r1 / $r2 / $r3 = 0.0 / 0.0 / 1.0
368$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
369set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
370$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
371sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
372$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
373long mov b32 $r3 0x3f800000 // $r3 = 1.0f
374long mov b32 $r2 0x00000000
375long mov b32 $r1 0x00000000
376long ret
377// R32_xINT: 32-bit load into $r0; $r1 / $r2 / $r3 = 0 / 0 / 1
378$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
379set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
380sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
381$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
382$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
383long mov b32 $r3 0x00000001 // $r3 = integer 1
384long mov b32 $r2 0x00000000
385long mov b32 $r1 0x00000000
386long ret
387// RG8_UNORM: two unsigned bytes converted to f32 and scaled; $r2 / $r3 = 0.0 / 1.0
388$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
389sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
390set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
391$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
392$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
393mov b32 $r3 0x3f800000 // $r3 = 1.0f
394cvt rn f32 $r1 u8 1 $r0 // $r1 = (float) byte 1 of $r0
395mov b32 $r2 0x00000000 // $r2 = 0
396cvt rn f32 $r0 u8 0 $r0 // $r0 = (float) byte 0 of $r0
397sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
398mul f32 $r1 $r1 0x3b808081 // scale; f32 bit pattern of ~1/255
399mul f32 $r0 $r0 0x3b808081
400long ret
401// RG8_SNORM: two signed bytes converted to f32 and scaled; $r2 / $r3 = 0.0 / 1.0
402$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
403set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
404$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
405$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
406sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
407long mov b32 $r3 0x3f800000 // $r3 = 1.0f
408cvt rn f32 $r1 s8 1 $r0
409long mov b32 $r2 0x00000000 // $r2 = 0
410cvt rn f32 $r0 s8 0 $r0
411mul f32 $r1 $r1 0x3c010204 // scale; f32 bit pattern of ~1/127
412mul f32 $r0 $r0 0x3c010204
413long ret
414// RG8_UINT: two unsigned bytes zero-extended to u32; $r2 / $r3 = 0 / 1
415sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
416$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
417set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
418$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
419$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
420long mov b32 $r3 0x00000001 // $r3 = integer 1
421cvt u32 $r1 u8 1 $r0
422long mov b32 $r2 0x00000000 // $r2 = 0
423sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
424cvt u32 $r0 u8 0 $r0
425long ret
426// RG8_SINT: two signed bytes sign-extended to s32; $r2 / $r3 = 0 / 1
427$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
428set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
429$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
430$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
431long mov b32 $r3 0x00000001 // $r3 = integer 1
432sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
433cvt s32 $r1 s8 1 $r0
434long mov b32 $r2 0x00000000 // $r2 = 0
435cvt s32 $r0 s8 0 $r0
436long ret
437// R16_UNORM: one unsigned 16-bit channel converted to f32 and scaled; $r1 / $r2 / $r3 = 0.0 / 0.0 / 1.0
438$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
439set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
440$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
441sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
442$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
443long mov b32 $r3 0x3f800000 // $r3 = 1.0f
444cvt rn f32 $r0 u16 0 $r0
445long mov b32 $r2 0x00000000
446long mov b32 $r1 0x00000000
447mul f32 $r0 $r0 0x37800074 // scale; f32 bit pattern just above 2^-16 (presumably ~1/65535)
448long ret
449// R16_SNORM: one signed 16-bit channel converted to f32 and scaled; $r1 / $r2 / $r3 = 0.0 / 0.0 / 1.0
450sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
451$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
452set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
453$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
454$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
455mov b32 $r3 0x3f800000 // $r3 = 1.0f
456cvt rn f32 $r0 s16 0 $r0 // the u16 load result is reinterpreted as s16 here
457long mov b32 $r2 0x00000000
458sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
459long mov b32 $r1 0x00000000
460mul f32 $r0 $r0 0x38000187 // scale; f32 bit pattern just above 2^-15 (presumably ~1/32767)
461long ret
462// R16_SINT: s16 load (no separate conversion step); $r1 / $r2 / $r3 = 0 / 0 / 1
463$p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0
464set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
465$p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0
466$p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0
467sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
468long mov b32 $r3 0x00000001 // $r3 = integer 1
469long mov b32 $r2 0x00000000
470long mov b32 $r1 0x00000000
471long ret
472// R16_UINT: u16 load (no separate conversion step); $r1 / $r2 / $r3 = 0 / 0 / 1
473$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
474set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
475$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
476sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
477$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
478long mov b32 $r3 0x00000001 // $r3 = integer 1
479long mov b32 $r2 0x00000000
480long mov b32 $r1 0x00000000
481long ret
482// R16_FLOAT: one f16 channel widened to f32; $r1 / $r2 / $r3 = 0.0 / 0.0 / 1.0
483$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
484set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
485sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
486$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
487$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
488long mov b32 $r3 0x3f800000 // $r3 = 1.0f
489long mov b32 $r2 0x00000000
490cvt f32 $r0 f16 $r0 0
491mov b32 $r1 0x00000000
492long ret
493// R8_UNORM: one unsigned byte converted to f32 and scaled; $r1 / $r2 / $r3 = 0.0 / 0.0 / 1.0
494sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
495$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
496set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
497$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
498$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
499mov b32 $r3 0x3f800000 // $r3 = 1.0f
500cvt rn f32 $r0 u8 0 $r0
501mov b32 $r2 0x00000000
502sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
503mul f32 $r0 $r0 0x3b808081 // scale; f32 bit pattern of ~1/255
504mov b32 $r1 0x00000000
505long ret
506// R8_SNORM: one signed byte converted to f32 and scaled; $r1 / $r2 / $r3 = 0.0 / 0.0 / 1.0
507$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
508set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
509$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
510$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
511sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
512mov b32 $r3 0x3f800000 // $r3 = 1.0f
513cvt rn f32 $r0 s8 0 $r0 // the u8 load result is reinterpreted as s8 here
514mov b32 $r2 0x00000000
515mul f32 $r0 $r0 0x3c010204 // scale; f32 bit pattern of ~1/127
516mov b32 $r1 0x00000000
517long ret
518// R8_SINT: s8 load (no separate conversion step); $r1 / $r2 / $r3 = 0 / 0 / 1
519$p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0
520sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
521set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
522$p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0
523$p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0
524long mov b32 $r3 0x00000001 // $r3 = integer 1
525long mov b32 $r2 0x00000000
526long mov b32 $r1 0x00000000
527long ret
528// R8_UINT: u8 load (no separate conversion step); $r1 / $r2 / $r3 = 0 / 0 / 1
529sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
530$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
531set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
532$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
533$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
534long mov b32 $r3 0x00000001 // $r3 = integer 1
535long mov b32 $r2 0x00000000
536long mov b32 $r1 0x00000000
537sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
538long ret
539// R11G11B10_FLOAT TODO: unpacking is not implemented; only $r3 is written, $r0 - $r2 are left untouched
540$p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0 // packed texel is loaded into $r3 ...
541set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor !$p2 (presumably): true for the 00 (cv) case
542$p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
543$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
544long mov b32 $r3 0x3f800000 // ... and then overwritten with 1.0f, so the loaded value is discarded
545long nop
546long ret
547
548
549// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
550//
551// INPUT:   $r0d (x)
552// OUTPUT:  $r0d (rcp(x))
553// CLOBBER: $r2 - $r7
554// SIZE:    9 * 8 bytes
555// NOTE: the body below is only nop + ret; the recurrence described above is not implemented here and $r0d is returned unchanged.
556gk104_rcp_f64:
557   long nop // placeholder
558   long ret
559
560// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
561//
562// INPUT:   $r0d (x)
563// OUTPUT:  $r0d (rsqrt(x))
564// CLOBBER: $r2 - $r7
565// SIZE:    14 * 8 bytes
566// NOTE: the body below is only nop + ret; the recurrence described above is not implemented here and $r0d is returned unchanged.
567gk104_rsq_f64:
568   long nop // placeholder
569   long ret
570
571//
572// Trap handler: dumps the state of one faulting thread into the trap info buffer.
573// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
574// Low 32 bytes of l[] memory shouldn't be used if resumability is required.
575//
576// Trap info (byte offsets from the pointer read from c0[0x1900] / c0[0x1904]):
577// 0x000: mutex
578// 0x004: PC
579// 0x008: trapstat
580// 0x00c: warperr
581// 0x010: tidx
582// 0x014: tidy
583// 0x018: tidz
584// 0x01c: ctaidx
585// 0x020: ctaidy
586// 0x024: ctaidz
587// 0x030: $r0q
588// 0x130: $flags
589// 0x140: s[]
590//
591st b128 wb l[0x00] $r0q // save $r0 - $r3 to l[] so they can be used as scratch
592// check state of the warp and continue if it didn't cause the trap
593long mov b32 $r1 $trapstat
594long mov b32 $r3 $warperr
595mov $r2 $flags mask 0xffff // keep $flags in $r2 (stored at +0x130 or restored at end_cont)
596and b32 0 $c $r1 $r3 // result discarded; only $c is set from trapstat & warperr
597e $c bra #end_cont // zero: resume without dumping
598// spill control flow stack to l[]
599long mov b32 $r3 16 // loop counter
600spill_cfstack:
601preret #end_exit // one entry per iteration, each targeting end_exit
602sub b32 $r3 $c $r3 0x1
603lg $c bra #spill_cfstack
604// retrieve pointer to trap info
605mov b32 $r0 c0[0x1900] // low word of the pointer
606mov b32 $r1 c0[0x1904] // high word of the pointer
607// we only let a single faulting thread store its state
608mov b32 $r3 0x1
609exch b32 $r3 g[$r0d] $r3 // swap 1 into the mutex at +0x000; $r3 = previous value
610joinat #end_exit
611set $p0 0x1 eq u32 $r3 0x1 // $p0 = mutex was already taken
612join $p0 nop // threads that lost the race leave here (presumably to end_exit)
613// store $c and $p registers
614st b32 wb g[$r0d+0x130] $r2
615// store $trapstat and $warperr
616long mov b32 $r2 $trapstat
617long mov b32 $r3 $warperr
618st b64 wb g[$r0d+0x8] $r2d // +0x008 trapstat, +0x00c warperr
619// store registers $r4 - $r63 ($r0 - $r3 follow from their l[] copy)
620st b128 wb g[$r0d+0x40] $r4q
621st b128 wb g[$r0d+0x50] $r8q
622st b128 wb g[$r0d+0x60] $r12q
623st b128 wb g[$r0d+0x70] $r16q
624st b128 wb g[$r0d+0x80] $r20q
625st b128 wb g[$r0d+0x90] $r24q
626st b128 wb g[$r0d+0xa0] $r28q
627st b128 wb g[$r0d+0xb0] $r32q
628st b128 wb g[$r0d+0xc0] $r36q
629st b128 wb g[$r0d+0xd0] $r40q
630st b128 wb g[$r0d+0xe0] $r44q
631st b128 wb g[$r0d+0xf0] $r48q
632st b128 wb g[$r0d+0x100] $r52q
633st b128 wb g[$r0d+0x110] $r56q
634st b128 wb g[$r0d+0x120] $r60q
635ld b64 $r2d cs l[0x0] // original $r0 / $r1, saved on entry
636st b64 wb g[$r0d+0x30] $r2d
637ld b64 $r2d cs l[0x8] // original $r2 / $r3, saved on entry
638st b64 wb g[$r0d+0x38] $r2d
639// store thread id
640long mov b32 $r2 $tidx
641long mov b32 $r3 $tidy
642st b64 wb g[$r0d+0x10] $r2d // +0x010 tidx, +0x014 tidy
643long mov b32 $r2 $tidz
644long mov b32 $r3 $ctaidx
645st b64 wb g[$r0d+0x18] $r2d // +0x018 tidz, +0x01c ctaidx
646long mov b32 $r2 $ctaidy
647long mov b32 $r3 $ctaidz
648st b64 wb g[$r0d+0x20] $r2d // +0x020 ctaidy, +0x024 ctaidz
649// store shared memory (in reverse order so $r0d is base again at the end)
650long mov b32 $r3 $smemsz
651sub b32 $r3 $c $r3 0x4 // $r3 = offset of the last 32-bit word
652s $c bra #shared_done // negative result (size 0): nothing to copy
653add b32 $r0 $c $r0 $r3 // 64-bit add: $r0d += $r3 (low word, sets carry)
654add b32 $r1 $r1 0x0 $c // high word plus carry
655shared_loop:
656long ld b32 $r2 s[$r3]
657long st b32 wb g[$r0d+0x140] $r2 // s[] area starts at +0x140 of the trap info
658sub b32 $r0 $c $r0 0x4 // 64-bit decrement of the destination pointer
659sub b32 $r1 $r1 0x0 $c
660sub b32 $r3 $c $r3 0x4 // next lower shared memory offset
661lg $c bra #shared_loop // NOTE(review): if `lg` means "result non-zero", the loop ends when $r3 reaches 0, before s[0x0] is copied - confirm
662shared_done:
663// search the stack for trap entry to retrieve PC
664mov b32 $r0 c0[0x1908] // low word of the stack buffer address
665mov b32 $r1 c0[0x190c] // high word of the stack buffer address
666membar sys
667// invalidate caches so we can read stack entries via g[]
668cctl ivall 0 l[0]
669cctl ivall 0 g[$r0d]
670// get offsets
671mov b32 $r2 $physid
672ext u32 $r3 $r2 0x0814 // MP id
673ext u32 $r2 $r2 0x0608 // warp id
674mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
675mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
676add b32 $r2 $r2 $r3 // MP + warp offset
677add b32 $r0 $c $r0 $r2 // 64-bit add: $r0d += offset (low word, sets carry)
678add b32 $r1 $r1 0x0 $c // high word plus carry
679mov b32 $r3 c0[0x1918] // cstack size: bytes left to scan. Loaded once, before the loop, so the decrement below accumulates and the search terminates
680search_cstack:
681ld u8 $r2 cv g[$r0d+0x8] // byte at +0x8 of the current 0x10-byte entry
682set $p0 0x1 eq u32 $r2 0xa // 0xa marks the trap entry
683$p0 bra #entry_found
684add b32 $r0 $c $r0 0x10 // advance to the next entry (64-bit add)
685add b32 $r1 $r1 0x0 $c
686sub b32 $r3 $c $r3 0x10 // one entry fewer to scan
687lg $c bra #search_cstack
688bra #end_exit // no trap entry found: give up without storing a PC
689entry_found:
690// load PC (may be unaligned and spread out)
691ld b32 $r2 cv g[$r0d] // first 32-bit word of the matching stack entry
692mov b32 $r0 c0[0x1900] // reload the trap info pointer
693mov b32 $r1 c0[0x1904]
694st b32 wb g[$r0d+0x4] $r2 // +0x004: PC
695join nop
696// invalidate caches and exit
697end_exit:
698cctl ivall 0 g[0]
699bpt pause 0x0
700rtt terminate
701end_cont:
702bpt pause 0x0
703mov $flags $r2 mask 0xffff // restore $flags from the copy taken on entry
704ld b128 $r0q cs l[0x00] // restore $r0 - $r3 from their l[] save slot
705rtt
706
707.section #gk104_builtin_offsets // one 64-bit entry per labelled builtin; the SULDP handlers and the trap handler are not listed
708.b64 #gk104_div_u32
709.b64 #gk104_div_s32
710.b64 #gk104_rcp_f64
711.b64 #gk104_rsq_f64
712