/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "isl/isl.h"
#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      namespace {
         /**
          * Generate a logical send opcode for a surface message and return
          * the result.
          */
         fs_reg
         emit_send(const fs_builder &bld, enum opcode opcode,
                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                   unsigned dims, unsigned arg, unsigned rsize,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Reduce the dynamically uniform surface index to a single
             * scalar.
             */
            const fs_reg usurface = bld.emit_uniformize(surface);
            const fs_reg srcs[] = {
               addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
            };
            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

            inst->size_written = rsize * dst.component_size(inst->exec_size);
            inst->predicate = pred;
            return dst;
         }
      }

      /**
       * Emit an untyped surface read opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }

      /**
       * Emit an untyped surface write opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic opcode.  \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
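         /* Atomic operations take zero, one or two data arguments depending
          * on \p op (e.g. increment takes none, add takes one and
          * compare-and-swap takes two); unused sources are passed in as
          * BAD_FILE and dropped from the payload assembled below.
          */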
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }

      /**
       * Emit a typed surface read opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }

      /**
       * Emit a typed surface write opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode.  \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }
   }
}

namespace {
   namespace image_format_info {
      /* The higher compiler layers use the GL enums for image formats even if
       * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
       * enum before we can use them.
       */
      enum isl_format
      isl_format_for_gl_format(uint32_t gl_format)
      {
         switch (gl_format) {
         case GL_R8:             return ISL_FORMAT_R8_UNORM;
         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
         case GL_R8I:            return ISL_FORMAT_R8_SINT;
         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
         case GL_R16:            return ISL_FORMAT_R16_UNORM;
         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
         case GL_R16I:           return ISL_FORMAT_R16_SINT;
         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
         case GL_R32I:           return ISL_FORMAT_R32_SINT;
         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
         default:
            assert(!"Invalid image format");
            return ISL_FORMAT_UNSUPPORTED;
         }
      }

      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };

      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      inline color_u
      get_bit_widths(isl_format format)
      {
         const isl_format_layout *fmtl = isl_format_get_layout(format);

         return color_u(fmtl->channels.r.bits,
                        fmtl->channels.g.bits,
                        fmtl->channels.b.bits,
                        fmtl->channels.a.bits);
      }

      /**
       * Return the per-channel bitfield shifts for a given image format.
       */
      inline color_u
      get_bit_shifts(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }
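
      /* For example, ISL_FORMAT_R10G10B10A2_UNORM has per-channel widths
       * (10, 10, 10, 2), so get_bit_shifts() returns (0, 10, 20, 30).
       */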

      /**
       * Return true if all present components have the same bit width.
       */
      inline bool
      is_homogeneous(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }

      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      inline bool
      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                 format == isl_lower_storage_image_format(devinfo, format);
      }

      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      inline bool
      has_supported_bit_layout(const gen_device_info *devinfo,
                               isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            isl_lower_storage_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }

      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      inline bool
      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (isl_format_get_num_channels(format) <
                 isl_format_get_num_channels(lower_format));
      }

      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component.  This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      inline bool
      has_undefined_high_bits(const gen_device_info *devinfo,
                              isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == ISL_FORMAT_R16_UINT ||
                  lower_format == ISL_FORMAT_R8_UINT));
      }

      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      inline bool
      needs_sign_extension(isl_format format)
      {
         return isl_format_has_snorm_channel(format) ||
                isl_format_has_sint_channel(format);
      }
   }

   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access.  Reading or writing
             * to a surface of a type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }

      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0.  Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound.  Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }

      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0.  Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

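         /* Chain the per-component comparisons: the first CMP initializes
          * f0.0 and each subsequent CMP is predicated on the previous
          * result, so the flag ends up holding the conjunction of all the
          * in-bounds checks.
          */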
         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }

   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
       */
      unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            isl_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            format != ISL_FORMAT_UNSUPPORTED &&
            !isl_has_matching_typed_storage_image_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);

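         /* E.g. a 1D array image that takes the untyped fall-back path has
          * surf_dims == 1 and arr_dims == 1 and needs three coordinates:
          * (x, 0, layer).
          */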
         return surf_dims + zero_dims + arr_dims;
      }

      /**
       * Transform image coordinates into the form expected by the
       * implementation.
       */
      fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             isl_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * so insert a zero at the Y component to shift it into the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }

      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves manually taking into account the
       * tiling and swizzling modes of the surface, so it will hopefully not
       * be needed very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset.  It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline.  The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format.  At each miplevel, the slices are arranged in rows of
          * 2^level slices per row.  The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2D arrangement of miplevels,
          * each one being an array of slices.  In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can be made to handle both 2-D array and 3-D textures
          * by passing in the miplevel as tile.z for 3-D textures and 0 in
          * tile.z for 2-D array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices.  In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other.  This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces.  For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity.  For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff causing the relevant bits of both tmp.x and .y to be
                * zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value.  Note that addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      namespace {
         /**
          * Maximum representable value in an unsigned integer with the given
          * number of bits.
          */
         inline unsigned
         scale(unsigned n)
         {
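            /* e.g. scale(8) == 0xff and scale(16) == 0xffff.  Callers are
             * expected to pass widths below 32, since 32-bit-per-channel
             * formats take the trivial conversion path instead.
             */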
            return (1 << n) - 1;
         }
      }

      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

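         /* E.g. with shifts (0, 8, 16, 24) and widths (8, 8, 8, 8) all four
          * channels are OR-ed into dst.x, while with shifts (0, 16, 32, 48)
          * and widths (16, 16, 16, 16) (RG32 accessed through RGBA16UI) the
          * first two channels land in dst.x and the last two in dst.y.
          */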
         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }

      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

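         /* E.g. to extract an 8-bit field at bit 8 of a dword: shift left by
          * 32 - 8 - 8 = 16 to place it at the top, then shift right by
          * 32 - 8 = 24 to bring it back down.  The right shift fills the
          * high bits with copies of the sign bit when dst is of a signed
          * type and with zeros otherwise.
          */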
         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
       */
      fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

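         /* E.g. for a signed 8-bit channel the value is clamped to
          * [-scale(7) - 1, scale(7)] = [-128, 127] and then AND-ed with
          * scale(8) = 0xff, so negative values reach the hardware as their
          * 8-bit two's complement pattern rather than as negative 32-bit
          * integers.
          */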
         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
       */
      fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

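         /* E.g. an SNORM8 channel is divided by scale(7) = 127, mapping
          * -127..127 onto -1.0..1.0.  The extra representable value -128
          * would yield -128/127, which is why the result is clamped to -1.0
          * below.
          */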
         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }

      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
       */
      fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
       */
      fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

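         /* E.g. an 11-bit channel of R11G11B10_FLOAT (5-bit exponent, 6-bit
          * mantissa, no sign bit) is shifted left by 15 - 11 = 4, which
          * lines its exponent and mantissa up with the half-float layout and
          * leaves the sign bit zero, so F16TO32 can expand it.
          */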
         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }

      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

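         /* This is the inverse of emit_convert_from_float() above: convert
          * to half-float first and then drop the 15 - widths[c] least
          * significant mantissa bits, which truncates the value towards
          * zero.  The narrow formats are unsigned, hence the clamp below.
          */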
         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp negative values to zero; the 10-bit and 11-bit float
                * formats have no sign bit. */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width.  This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
}

namespace brw {
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates.  \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const gen_device_info *devinfo = bld.shader->devinfo;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  isl_format_get_num_channels(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = isl_format_get_layout(format)->bpb / 32;
            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, so we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should return zero. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!isl_format_has_int_channel(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     isl_format_has_snorm_channel(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }

      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates.  \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const gen_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (gl_format == GL_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            const isl_format lower_format =
               isl_lower_storage_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (isl_format_has_int_channel(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                isl_format_has_sint_channel(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               isl_format_has_snorm_channel(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                isl_format_get_num_channels(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = isl_format_get_layout(format)->bpb / 32;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, so we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }

      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates.  \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively.  Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   ISL_FORMAT_R32_UINT);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  ISL_FORMAT_R32_UINT);

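         /* R32_UINT is expected to have a matching typed format on all
          * supported generations, so num_image_coordinates() never needs to
          * insert the dummy Z coordinate here.
          */
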
         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should return zero. */
         if (rsize && pred)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return retype(tmp, src0.type);
      }
   }
}