/* translate_sse.c revision 8b7f760f835f870b8f6af6c4d6613d44440f1dc5 */
1069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project/* 2069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * Copyright 2003 Tungsten Graphics, inc. 3069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * All Rights Reserved. 4069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * 5069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * Permission is hereby granted, free of charge, to any person obtaining a 6069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * copy of this software and associated documentation files (the "Software"), 7069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * to deal in the Software without restriction, including without limitation 8069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * on the rights to use, copy, modify, merge, publish, distribute, sub 9069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * license, and/or sell copies of the Software, and to permit persons to whom 10069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * the Software is furnished to do so, subject to the following conditions: 11069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * 12069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * The above copyright notice and this permission notice (including the next 13069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * paragraph) shall be included in all copies or substantial portions of the 14069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * Software. 
15069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * 16069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * USE OR OTHER DEALINGS IN THE SOFTWARE. 
23069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * 24069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * Authors: 25069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * Keith Whitwell <keithw@tungstengraphics.com> 26069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project */ 27069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 28069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 29069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "pipe/p_config.h" 30069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "pipe/p_compiler.h" 31069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "util/u_memory.h" 32069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "util/u_math.h" 33069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "util/u_format.h" 34069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 35069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "translate.h" 36069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 37069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 38069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) 39069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 40069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "rtasm/rtasm_cpu.h" 41069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#include "rtasm/rtasm_x86sse.h" 42069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 43069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 44069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define X 0 
45069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define Y 1 46069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define Z 2 47069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define W 3 48069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 49069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 50d42abb2fd917184764daf22f5f299e848b8701d7Narayan Kamathstruct translate_buffer { 51d42abb2fd917184764daf22f5f299e848b8701d7Narayan Kamath const void *base_ptr; 52d42abb2fd917184764daf22f5f299e848b8701d7Narayan Kamath uintptr_t stride; 53d42abb2fd917184764daf22f5f299e848b8701d7Narayan Kamath unsigned max_index; 54069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project}; 55d42abb2fd917184764daf22f5f299e848b8701d7Narayan Kamath 56069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstruct translate_buffer_variant { 57069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned buffer_index; 58069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned instance_divisor; 59069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project void *ptr; /* updated either per vertex or per instance */ 60069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project}; 61069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 62069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 63069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define ELEMENT_BUFFER_INSTANCE_ID 1001 64069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 65069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define NUM_CONSTS 7 66069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 67069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectenum 68069490a5ca2fd1988d29daf45d892f47ad665115The Android 
Open Source Project{ 69069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project CONST_IDENTITY, 70069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project CONST_INV_127, 71069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project CONST_INV_255, 72069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project CONST_INV_32767, 73069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project CONST_INV_65535, 74069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project CONST_INV_2147483647, 75069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project CONST_255 76069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project}; 77069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 78069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} 79069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstatic float consts[NUM_CONSTS][4] = { 80069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project {0, 0, 0, 1}, 81069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project C(1.0 / 127.0), 82069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project C(1.0 / 255.0), 83069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project C(1.0 / 32767.0), 84069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project C(1.0 / 65535.0), 85069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project C(1.0 / 2147483647.0), 86069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project C(255.0) 87069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project}; 88069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#undef C 89069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 90069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstruct 
translate_sse { 91069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct translate translate; 92069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 93069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_function linear_func; 94069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_function elt_func; 95069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_function elt16_func; 96069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_function elt8_func; 97069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_function *func; 98069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 99069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; 100069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project int8_t reg_to_const[16]; 101069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project int8_t const_to_reg[NUM_CONSTS]; 102069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 103069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; 104069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned nr_buffers; 105069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 106069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* Multiple buffer variants can map to a single buffer. 
*/ 107069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS]; 108069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned nr_buffer_variants; 109069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 110069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* Multiple elements can map to a single buffer variant. */ 111069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS]; 112069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 113069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project boolean use_instancing; 114069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned instance_id; 115069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 116069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* these are actually known values, but putting them in a struct 117069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * like this is helpful to keep them in sync across the file. 
118069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project */ 119069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg tmp_EAX; 120069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg tmp2_EDX; 121069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg src_ECX; 122069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg idx_ESI; /* either start+i or &elt[i] */ 123069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg machine_EDI; 124069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg outbuf_EBX; 125069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg count_EBP; /* decrements to zero */ 126069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project}; 127069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 128069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstatic int get_offset( const void *a, const void *b ) 129069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project{ 130069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project return (const char *)b - (const char *)a; 131069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project} 132069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 133069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstatic struct x86_reg get_const( struct translate_sse *p, unsigned id) 134069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project{ 135069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg reg; 136069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned i; 137069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 138069490a5ca2fd1988d29daf45d892f47ad665115The Android Open 
Source Project if(p->const_to_reg[id] >= 0) 139069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project return x86_make_reg(file_XMM, p->const_to_reg[id]); 140069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 141069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project for(i = 2; i < 8; ++i) 142069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project { 143069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(p->reg_to_const[i] < 0) 144069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 145069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project } 146069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 147069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* TODO: be smarter here */ 148069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(i == 8) 149069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project --i; 150069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 151069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project reg = x86_make_reg(file_XMM, i); 152069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 153069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(p->reg_to_const[i] >= 0) 154069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project p->const_to_reg[p->reg_to_const[i]] = -1; 155069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 156069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project p->reg_to_const[i] = id; 157069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project p->const_to_reg[id] = i; 158069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 159069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* TODO: this should happen outside the loop, if possible */ 
160069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movaps(p->func, reg, 161069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project x86_make_disp(p->machine_EDI, 162069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project get_offset(p, &p->consts[id][0]))); 163069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 164069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project return reg; 165069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project} 166069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 167069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project/* load the data in a SSE2 register, padding with zeros */ 168069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstatic boolean emit_load_sse2( struct translate_sse *p, 169069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg data, 170069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg src, 171069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned size) 172069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project{ 173069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 174069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg tmp = p->tmp_EAX; 175069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project switch(size) 176069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project { 177069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 1: 178069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project x86_movzx8(p->func, tmp, src); 179069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movd(p->func, data, tmp); 180069490a5ca2fd1988d29daf45d892f47ad665115The Android 
Open Source Project break; 181069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 2: 182069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project x86_movzx16(p->func, tmp, src); 183069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movd(p->func, data, tmp); 184069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 185069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 3: 186069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); 187069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project x86_shl_imm(p->func, tmp, 16); 188069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project x86_mov16(p->func, tmp, src); 189069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movd(p->func, data, tmp); 190069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 191069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 4: 192069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movd(p->func, data, src); 193069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 194069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 6: 195069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movd(p->func, data, src); 196069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); 197069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movd(p->func, tmpXMM, tmp); 198069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_punpckldq(p->func, data, tmpXMM); 199069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 200069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 8: 
201069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movq(p->func, data, src); 202069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 203069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 12: 204069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movq(p->func, data, src); 205069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); 206069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_punpcklqdq(p->func, data, tmpXMM); 207069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 208069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 16: 209069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movdqu(p->func, data, src); 210069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 211069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project default: 212069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project return FALSE; 213069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project } 214069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project return TRUE; 215069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project} 216069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 217069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project/* this value can be passed for the out_chans argument */ 218069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project#define CHANNELS_0001 5 219069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 220069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project/* this function will load #chans float values, and will 221069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * pad the register with 
zeroes at least up to out_chans. 222069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * 223069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * If out_chans is set to CHANNELS_0001, then the fourth 224069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * value will be padded with 1. Only pass this value if 225069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * chans < 4 or results are undefined. 226069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project */ 227069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstatic void emit_load_float32( struct translate_sse *p, 228069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg data, 229069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg arg0, 230069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned out_chans, 231069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned chans) 232069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project{ 233069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project switch(chans) 234069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project { 235069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 1: 236069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* a 0 0 0 237069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * a 0 0 1 238069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project */ 239069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movss(p->func, data, arg0); 240069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(out_chans == CHANNELS_0001) 241069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); 
242069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 243069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 2: 244069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* 0 0 0 1 245069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * a b 0 1 246069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project */ 247069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(out_chans == CHANNELS_0001) 248069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); 249069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project else if(out_chans > 2) 250069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); 251069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movlps(p->func, data, arg0); 252069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 253069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 3: 254069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project /* Have to jump through some hoops: 255069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * 256069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * c 0 0 0 257069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * c 0 0 1 if out_chans == CHANNELS_0001 258069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * 0 0 c 0/1 259069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * a b c 0/1 260069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project */ 261069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movss(p->func, data, x86_make_disp(arg0, 8)); 262069490a5ca2fd1988d29daf45d892f47ad665115The Android 
Open Source Project if(out_chans == CHANNELS_0001) 263069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) ); 264069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); 265069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movlps(p->func, data, arg0); 266069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 267069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 4: 268069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movups(p->func, data, arg0); 269069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 270069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project } 271069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project} 272069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 273069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project/* this function behaves like emit_load_float32, but loads 274069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project 64-bit floating point numbers, converting them to 32-bit 275069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project ones */ 276069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectstatic void emit_load_float64to32( struct translate_sse *p, 277069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg data, 278069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project struct x86_reg arg0, 279069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned out_chans, 280069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project unsigned chans) 281069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project{ 282069490a5ca2fd1988d29daf45d892f47ad665115The 
Android Open Source Project struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 283069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project switch(chans) 284069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project { 285069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 1: 286069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movsd(p->func, data, arg0); 287069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(out_chans > 1) 288069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_cvtpd2ps(p->func, data, data); 289069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project else 290069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_cvtsd2ss(p->func, data, data); 291069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(out_chans == CHANNELS_0001) 292069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); 293069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project break; 294069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project case 2: 295069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_movupd(p->func, data, arg0); 296069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse2_cvtpd2ps(p->func, data, data); 297069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project if(out_chans == CHANNELS_0001) 298069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); 299069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project else if(out_chans > 2) 300069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); 301 break; 302 case 3: 303 
sse2_movupd(p->func, data, arg0); 304 sse2_cvtpd2ps(p->func, data, data); 305 sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); 306 if(out_chans > 3) 307 sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 308 else 309 sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); 310 sse_movlhps(p->func, data, tmpXMM); 311 if(out_chans == CHANNELS_0001) 312 sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); 313 break; 314 case 4: 315 sse2_movupd(p->func, data, arg0); 316 sse2_cvtpd2ps(p->func, data, data); 317 sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); 318 sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 319 sse_movlhps(p->func, data, tmpXMM); 320 break; 321 } 322} 323 324static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) 325{ 326 if(x86_target(p->func) != X86_32) 327 x64_mov64(p->func, dst_gpr, src_gpr); 328 else 329 { 330 /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ 331 if(x86_target_caps(p->func) & X86_SSE2) 332 sse2_movq(p->func, dst_xmm, src_xmm); 333 else 334 sse_movlps(p->func, dst_xmm, src_xmm); 335 } 336} 337 338static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) 339{ 340 emit_mov64(p, dst_gpr, dst_xmm, src, src); 341} 342 343static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) 344{ 345 emit_mov64(p, dst, dst, src_gpr, src_xmm); 346} 347 348static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) 349{ 350 if(x86_target_caps(p->func) & X86_SSE2) 351 sse2_movdqu(p->func, dst, src); 352 else 353 sse_movups(p->func, dst, src); 354} 355 356/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, 357 * but may or may not be good on older processors 358 * TODO: may perhaps want to use non-temporal stores here if possible 359 */ 360static void emit_memcpy(struct translate_sse *p, struct x86_reg 
dst, struct x86_reg src, unsigned size) 361{ 362 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 363 struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); 364 struct x86_reg dataGPR = p->tmp_EAX; 365 struct x86_reg dataGPR2 = p->tmp2_EDX; 366 367 if(size < 8) 368 { 369 switch (size) 370 { 371 case 1: 372 x86_mov8(p->func, dataGPR, src); 373 x86_mov8(p->func, dst, dataGPR); 374 break; 375 case 2: 376 x86_mov16(p->func, dataGPR, src); 377 x86_mov16(p->func, dst, dataGPR); 378 break; 379 case 3: 380 x86_mov16(p->func, dataGPR, src); 381 x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); 382 x86_mov16(p->func, dst, dataGPR); 383 x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); 384 break; 385 case 4: 386 x86_mov(p->func, dataGPR, src); 387 x86_mov(p->func, dst, dataGPR); 388 break; 389 case 6: 390 x86_mov(p->func, dataGPR, src); 391 x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); 392 x86_mov(p->func, dst, dataGPR); 393 x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); 394 break; 395 } 396 } 397 else if(!(x86_target_caps(p->func) & X86_SSE)) 398 { 399 unsigned i = 0; 400 assert((size & 3) == 0); 401 for(i = 0; i < size; i += 4) 402 { 403 x86_mov(p->func, dataGPR, x86_make_disp(src, i)); 404 x86_mov(p->func, x86_make_disp(dst, i), dataGPR); 405 } 406 } 407 else 408 { 409 switch(size) 410 { 411 case 8: 412 emit_load64(p, dataGPR, dataXMM, src); 413 emit_store64(p, dst, dataGPR, dataXMM); 414 break; 415 case 12: 416 emit_load64(p, dataGPR2, dataXMM, src); 417 x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); 418 emit_store64(p, dst, dataGPR2, dataXMM); 419 x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); 420 break; 421 case 16: 422 emit_mov128(p, dataXMM, src); 423 emit_mov128(p, dst, dataXMM); 424 break; 425 case 24: 426 emit_mov128(p, dataXMM, src); 427 emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); 428 emit_mov128(p, dst, dataXMM); 429 emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); 430 break; 431 case 32: 432 emit_mov128(p, 
dataXMM, src); 433 emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); 434 emit_mov128(p, dst, dataXMM); 435 emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); 436 break; 437 default: 438 assert(0); 439 } 440 } 441} 442 443static boolean translate_attr_convert( struct translate_sse *p, 444 const struct translate_element *a, 445 struct x86_reg src, 446 struct x86_reg dst) 447 448{ 449 const struct util_format_description* input_desc = util_format_description(a->input_format); 450 const struct util_format_description* output_desc = util_format_description(a->output_format); 451 unsigned i; 452 boolean id_swizzle = TRUE; 453 unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; 454 unsigned needed_chans = 0; 455 unsigned imms[2] = {0, 0x3f800000}; 456 457 if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) 458 return FALSE; 459 460 if(input_desc->channel[0].size & 7) 461 return FALSE; 462 463 if(input_desc->colorspace != output_desc->colorspace) 464 return FALSE; 465 466 for(i = 1; i < input_desc->nr_channels; ++i) 467 { 468 if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) 469 return FALSE; 470 } 471 472 for(i = 1; i < output_desc->nr_channels; ++i) 473 { 474 if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) 475 return FALSE; 476 } 477 478 for(i = 0; i < output_desc->nr_channels; ++i) 479 { 480 if(output_desc->swizzle[i] < 4) 481 swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; 482 } 483 484 if((x86_target_caps(p->func) & X86_SSE) && (0 485 || a->output_format == PIPE_FORMAT_R32_FLOAT 486 || a->output_format == PIPE_FORMAT_R32G32_FLOAT 487 || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT 488 || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) 489 { 490 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 491 492 for(i = 0; i < output_desc->nr_channels; ++i) 493 { 494 
if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) 495 swizzle[i] = i; 496 } 497 498 for(i = 0; i < output_desc->nr_channels; ++i) 499 { 500 if(swizzle[i] < 4) 501 needed_chans = MAX2(needed_chans, swizzle[i] + 1); 502 if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) 503 id_swizzle = FALSE; 504 } 505 506 if(needed_chans > 0) 507 { 508 switch(input_desc->channel[0].type) 509 { 510 case UTIL_FORMAT_TYPE_UNSIGNED: 511 if(!(x86_target_caps(p->func) & X86_SSE2)) 512 return FALSE; 513 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); 514 515 /* TODO: add support for SSE4.1 pmovzx */ 516 switch(input_desc->channel[0].size) 517 { 518 case 8: 519 /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ 520 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 521 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 522 break; 523 case 16: 524 sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 525 break; 526 case 32: /* we lose precision here */ 527 sse2_psrld_imm(p->func, dataXMM, 1); 528 break; 529 default: 530 return FALSE; 531 } 532 sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 533 if(input_desc->channel[0].normalized) 534 { 535 struct x86_reg factor; 536 switch(input_desc->channel[0].size) 537 { 538 case 8: 539 factor = get_const(p, CONST_INV_255); 540 break; 541 case 16: 542 factor = get_const(p, CONST_INV_65535); 543 break; 544 case 32: 545 factor = get_const(p, CONST_INV_2147483647); 546 break; 547 default: 548 assert(0); 549 factor.disp = 0; 550 factor.file = 0; 551 factor.idx = 0; 552 factor.mod = 0; 553 break; 554 } 555 sse_mulps(p->func, dataXMM, factor); 556 } 557 else if(input_desc->channel[0].size == 32) 558 sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ 559 break; 560 case UTIL_FORMAT_TYPE_SIGNED: 561 if(!(x86_target_caps(p->func) & X86_SSE2)) 562 return 
FALSE; 563 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); 564 565 /* TODO: add support for SSE4.1 pmovsx */ 566 switch(input_desc->channel[0].size) 567 { 568 case 8: 569 sse2_punpcklbw(p->func, dataXMM, dataXMM); 570 sse2_punpcklbw(p->func, dataXMM, dataXMM); 571 sse2_psrad_imm(p->func, dataXMM, 24); 572 break; 573 case 16: 574 sse2_punpcklwd(p->func, dataXMM, dataXMM); 575 sse2_psrad_imm(p->func, dataXMM, 16); 576 break; 577 case 32: /* we lose precision here */ 578 break; 579 default: 580 return FALSE; 581 } 582 sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 583 if(input_desc->channel[0].normalized) 584 { 585 struct x86_reg factor; 586 switch(input_desc->channel[0].size) 587 { 588 case 8: 589 factor = get_const(p, CONST_INV_127); 590 break; 591 case 16: 592 factor = get_const(p, CONST_INV_32767); 593 break; 594 case 32: 595 factor = get_const(p, CONST_INV_2147483647); 596 break; 597 default: 598 assert(0); 599 factor.disp = 0; 600 factor.file = 0; 601 factor.idx = 0; 602 factor.mod = 0; 603 break; 604 } 605 sse_mulps(p->func, dataXMM, factor); 606 } 607 break; 608 609 break; 610 case UTIL_FORMAT_TYPE_FLOAT: 611 if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) 612 return FALSE; 613 if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) 614 { 615 swizzle[3] = UTIL_FORMAT_SWIZZLE_W; 616 needed_chans = CHANNELS_0001; 617 } 618 switch(input_desc->channel[0].size) 619 { 620 case 32: 621 emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); 622 break; 623 case 64: /* we lose precision here */ 624 if(!(x86_target_caps(p->func) & X86_SSE2)) 625 return FALSE; 626 emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); 627 break; 628 default: 629 return FALSE; 630 } 631 break; 632 default: 633 return FALSE; 634 } 635 636 if(!id_swizzle) 637 sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); 638 } 639 640 
if(output_desc->nr_channels >= 4 641 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 642 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 643 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 644 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 645 ) 646 sse_movups(p->func, dst, dataXMM); 647 else 648 { 649 if(output_desc->nr_channels >= 2 650 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 651 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) 652 sse_movlps(p->func, dst, dataXMM); 653 else 654 { 655 if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) 656 sse_movss(p->func, dst, dataXMM); 657 else 658 x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); 659 660 if(output_desc->nr_channels >= 2) 661 { 662 if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) 663 { 664 sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); 665 sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); 666 } 667 else 668 x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); 669 } 670 } 671 672 if(output_desc->nr_channels >= 3) 673 { 674 if(output_desc->nr_channels >= 4 675 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 676 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) 677 sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); 678 else 679 { 680 if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) 681 { 682 sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); 683 sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); 684 } 685 else 686 x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); 687 688 if(output_desc->nr_channels >= 4) 689 { 690 if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) 691 { 692 sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); 693 sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); 694 } 695 else 696 x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); 697 } 698 } 699 } 700 } 701 return TRUE; 702 } 703 else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 704 && output_desc->channel[0].normalized == 
input_desc->channel[0].normalized 705 && (0 706 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) 707 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 708 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 709 )) 710 { 711 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 712 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 713 struct x86_reg tmp = p->tmp_EAX; 714 unsigned imms[2] = {0, 1}; 715 716 for(i = 0; i < output_desc->nr_channels; ++i) 717 { 718 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) 719 swizzle[i] = i; 720 } 721 722 for(i = 0; i < output_desc->nr_channels; ++i) 723 { 724 if(swizzle[i] < 4) 725 needed_chans = MAX2(needed_chans, swizzle[i] + 1); 726 if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) 727 id_swizzle = FALSE; 728 } 729 730 if(needed_chans > 0) 731 { 732 emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); 733 734 switch(input_desc->channel[0].type) 735 { 736 case UTIL_FORMAT_TYPE_UNSIGNED: 737 if(input_desc->channel[0].normalized) 738 { 739 sse2_punpcklbw(p->func, dataXMM, dataXMM); 740 if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 741 sse2_psrlw_imm(p->func, dataXMM, 1); 742 } 743 else 744 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 745 break; 746 case UTIL_FORMAT_TYPE_SIGNED: 747 if(input_desc->channel[0].normalized) 748 { 749 sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); 750 sse2_punpcklbw(p->func, tmpXMM, dataXMM); 751 sse2_psllw_imm(p->func, dataXMM, 9); 752 sse2_psrlw_imm(p->func, dataXMM, 8); 753 sse2_por(p->func, tmpXMM, dataXMM); 754 sse2_psrlw_imm(p->func, dataXMM, 7); 755 sse2_por(p->func, tmpXMM, dataXMM); 756 { 757 struct x86_reg t = dataXMM; 758 dataXMM = tmpXMM; 759 tmpXMM = t; 760 } 761 } 762 
else 763 { 764 sse2_punpcklbw(p->func, dataXMM, dataXMM); 765 sse2_psraw_imm(p->func, dataXMM, 8); 766 } 767 break; 768 default: 769 assert(0); 770 } 771 772 if(output_desc->channel[0].normalized) 773 imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; 774 775 if(!id_swizzle) 776 sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); 777 } 778 779 if(output_desc->nr_channels >= 4 780 && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 781 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 782 && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 783 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 784 ) 785 sse2_movq(p->func, dst, dataXMM); 786 else 787 { 788 if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) 789 { 790 if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) 791 sse2_movd(p->func, dst, dataXMM); 792 else 793 { 794 sse2_movd(p->func, tmp, dataXMM); 795 x86_mov16(p->func, dst, tmp); 796 if(output_desc->nr_channels >= 2) 797 x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); 798 } 799 } 800 else 801 { 802 if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) 803 x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); 804 else 805 { 806 x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); 807 if(output_desc->nr_channels >= 2) 808 { 809 sse2_movd(p->func, tmp, dataXMM); 810 x86_shr_imm(p->func, tmp, 16); 811 x86_mov16(p->func, x86_make_disp(dst, 2), tmp); 812 } 813 } 814 } 815 816 if(output_desc->nr_channels >= 3) 817 { 818 if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) 819 { 820 if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) 821 { 822 sse2_psrlq_imm(p->func, dataXMM, 32); 823 sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); 824 } 825 else 826 { 827 sse2_psrlq_imm(p->func, dataXMM, 32); 828 sse2_movd(p->func, tmp, dataXMM); 829 
x86_mov16(p->func, x86_make_disp(dst, 4), tmp); 830 if(output_desc->nr_channels >= 4) 831 { 832 x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); 833 } 834 } 835 } 836 else 837 { 838 if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) 839 x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); 840 else 841 { 842 x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); 843 844 if(output_desc->nr_channels >= 4) 845 { 846 sse2_psrlq_imm(p->func, dataXMM, 48); 847 sse2_movd(p->func, tmp, dataXMM); 848 x86_mov16(p->func, x86_make_disp(dst, 6), tmp); 849 } 850 } 851 } 852 } 853 } 854 return TRUE; 855 } 856 else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) 857 { 858 struct x86_reg tmp = p->tmp_EAX; 859 unsigned i; 860 if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 861 && swizzle[0] == UTIL_FORMAT_SWIZZLE_W 862 && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z 863 && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y 864 && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) 865 { 866 /* TODO: support movbe */ 867 x86_mov(p->func, tmp, src); 868 x86_bswap(p->func, tmp); 869 x86_mov(p->func, dst, tmp); 870 return TRUE; 871 } 872 873 for(i = 0; i < output_desc->nr_channels; ++i) 874 { 875 switch(output_desc->channel[0].size) 876 { 877 case 8: 878 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) 879 { 880 unsigned v = 0; 881 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) 882 { 883 switch(output_desc->channel[0].type) 884 { 885 case UTIL_FORMAT_TYPE_UNSIGNED: 886 v = output_desc->channel[0].normalized ? 0xff : 1; 887 break; 888 case UTIL_FORMAT_TYPE_SIGNED: 889 v = output_desc->channel[0].normalized ? 
0x7f : 1; 890 break; 891 default: 892 return FALSE; 893 } 894 } 895 x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); 896 } 897 else 898 { 899 x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); 900 x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); 901 } 902 break; 903 case 16: 904 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) 905 { 906 unsigned v = 0; 907 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) 908 { 909 switch(output_desc->channel[1].type) 910 { 911 case UTIL_FORMAT_TYPE_UNSIGNED: 912 v = output_desc->channel[1].normalized ? 0xffff : 1; 913 break; 914 case UTIL_FORMAT_TYPE_SIGNED: 915 v = output_desc->channel[1].normalized ? 0x7fff : 1; 916 break; 917 case UTIL_FORMAT_TYPE_FLOAT: 918 v = 0x3c00; 919 break; 920 default: 921 return FALSE; 922 } 923 } 924 x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); 925 } 926 else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) 927 x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); 928 else 929 { 930 x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); 931 x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); 932 } 933 break; 934 case 32: 935 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) 936 { 937 unsigned v = 0; 938 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) 939 { 940 switch(output_desc->channel[1].type) 941 { 942 case UTIL_FORMAT_TYPE_UNSIGNED: 943 v = output_desc->channel[1].normalized ? 0xffffffff : 1; 944 break; 945 case UTIL_FORMAT_TYPE_SIGNED: 946 v = output_desc->channel[1].normalized ? 
0x7fffffff : 1; 947 break; 948 case UTIL_FORMAT_TYPE_FLOAT: 949 v = 0x3f800000; 950 break; 951 default: 952 return FALSE; 953 } 954 } 955 x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); 956 } 957 else 958 { 959 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); 960 x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); 961 } 962 break; 963 case 64: 964 if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) 965 { 966 unsigned l = 0; 967 unsigned h = 0; 968 if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) 969 { 970 switch(output_desc->channel[1].type) 971 { 972 case UTIL_FORMAT_TYPE_UNSIGNED: 973 h = output_desc->channel[1].normalized ? 0xffffffff : 0; 974 l = output_desc->channel[1].normalized ? 0xffffffff : 1; 975 break; 976 case UTIL_FORMAT_TYPE_SIGNED: 977 h = output_desc->channel[1].normalized ? 0x7fffffff : 0; 978 l = output_desc->channel[1].normalized ? 0xffffffff : 1; 979 break; 980 case UTIL_FORMAT_TYPE_FLOAT: 981 h = 0x3ff00000; 982 l = 0; 983 break; 984 default: 985 return FALSE; 986 } 987 } 988 x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); 989 x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); 990 } 991 else 992 { 993 if(x86_target_caps(p->func) & X86_SSE) 994 { 995 struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); 996 emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); 997 emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); 998 } 999 else 1000 { 1001 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); 1002 x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); 1003 x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); 1004 x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); 1005 } 1006 } 1007 break; 1008 default: 1009 return FALSE; 1010 } 1011 } 1012 return TRUE; 1013 } 1014 /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ 1015 else if((x86_target_caps(p->func) & X86_SSE2) && 1016 a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 1017 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM 1018 || a->output_format 
== PIPE_FORMAT_R8G8B8A8_UNORM 1019 )) 1020 { 1021 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 1022 1023 /* load */ 1024 sse_movups(p->func, dataXMM, src); 1025 1026 if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) 1027 sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); 1028 1029 /* scale by 255.0 */ 1030 sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); 1031 1032 /* pack and emit */ 1033 sse2_cvtps2dq(p->func, dataXMM, dataXMM); 1034 sse2_packssdw(p->func, dataXMM, dataXMM); 1035 sse2_packuswb(p->func, dataXMM, dataXMM); 1036 sse2_movd(p->func, dst, dataXMM); 1037 1038 return TRUE; 1039 } 1040 1041 return FALSE; 1042} 1043 1044static boolean translate_attr( struct translate_sse *p, 1045 const struct translate_element *a, 1046 struct x86_reg src, 1047 struct x86_reg dst) 1048{ 1049 if(a->input_format == a->output_format) 1050 { 1051 emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); 1052 return TRUE; 1053 } 1054 1055 return translate_attr_convert(p, a, src, dst); 1056} 1057 1058static boolean init_inputs( struct translate_sse *p, 1059 unsigned index_size ) 1060{ 1061 unsigned i; 1062 struct x86_reg instance_id = x86_make_disp(p->machine_EDI, 1063 get_offset(p, &p->instance_id)); 1064 1065 for (i = 0; i < p->nr_buffer_variants; i++) { 1066 struct translate_buffer_variant *variant = &p->buffer_variant[i]; 1067 struct translate_buffer *buffer = &p->buffer[variant->buffer_index]; 1068 1069 if (!index_size || variant->instance_divisor) { 1070 struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI, 1071 get_offset(p, &buffer->max_index)); 1072 struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, 1073 get_offset(p, &buffer->stride)); 1074 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, 1075 get_offset(p, &variant->ptr)); 1076 struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, 1077 get_offset(p, &buffer->base_ptr)); 1078 struct x86_reg elt = p->idx_ESI; 1079 struct x86_reg tmp_EAX = p->tmp_EAX; 1080 1081 /* 
Calculate pointer to first attrib: 1082 * base_ptr + stride * index, where index depends on instance divisor 1083 */ 1084 if (variant->instance_divisor) { 1085 /* Our index is instance ID divided by instance divisor. 1086 */ 1087 x86_mov(p->func, tmp_EAX, instance_id); 1088 1089 if (variant->instance_divisor != 1) { 1090 struct x86_reg tmp_EDX = p->tmp2_EDX; 1091 struct x86_reg tmp_ECX = p->src_ECX; 1092 1093 /* TODO: Add x86_shr() to rtasm and use it whenever 1094 * instance divisor is power of two. 1095 */ 1096 1097 x86_xor(p->func, tmp_EDX, tmp_EDX); 1098 x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor); 1099 x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ 1100 } 1101 1102 /* XXX we need to clamp the index here too, but to a 1103 * per-array max value, not the draw->pt.max_index value 1104 * that's being given to us via translate->set_buffer(). 1105 */ 1106 } else { 1107 x86_mov(p->func, tmp_EAX, elt); 1108 1109 /* Clamp to max_index 1110 */ 1111 x86_cmp(p->func, tmp_EAX, buf_max_index); 1112 x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE); 1113 } 1114 1115 x86_imul(p->func, tmp_EAX, buf_stride); 1116 x64_rexw(p->func); 1117 x86_add(p->func, tmp_EAX, buf_base_ptr); 1118 1119 x86_cmp(p->func, p->count_EBP, p->tmp_EAX); 1120 1121 /* In the linear case, keep the buffer pointer instead of the 1122 * index number. 
1123 */ 1124 if (!index_size && p->nr_buffer_variants == 1) 1125 { 1126 x64_rexw(p->func); 1127 x86_mov(p->func, elt, tmp_EAX); 1128 } 1129 else 1130 { 1131 x64_rexw(p->func); 1132 x86_mov(p->func, buf_ptr, tmp_EAX); 1133 } 1134 } 1135 } 1136 1137 return TRUE; 1138} 1139 1140 1141static struct x86_reg get_buffer_ptr( struct translate_sse *p, 1142 unsigned index_size, 1143 unsigned var_idx, 1144 struct x86_reg elt ) 1145{ 1146 if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { 1147 return x86_make_disp(p->machine_EDI, 1148 get_offset(p, &p->instance_id)); 1149 } 1150 if (!index_size && p->nr_buffer_variants == 1) { 1151 return p->idx_ESI; 1152 } 1153 else if (!index_size || p->buffer_variant[var_idx].instance_divisor) { 1154 struct x86_reg ptr = p->src_ECX; 1155 struct x86_reg buf_ptr = 1156 x86_make_disp(p->machine_EDI, 1157 get_offset(p, &p->buffer_variant[var_idx].ptr)); 1158 1159 x64_rexw(p->func); 1160 x86_mov(p->func, ptr, buf_ptr); 1161 return ptr; 1162 } 1163 else { 1164 struct x86_reg ptr = p->src_ECX; 1165 const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx]; 1166 1167 struct x86_reg buf_stride = 1168 x86_make_disp(p->machine_EDI, 1169 get_offset(p, &p->buffer[variant->buffer_index].stride)); 1170 1171 struct x86_reg buf_base_ptr = 1172 x86_make_disp(p->machine_EDI, 1173 get_offset(p, &p->buffer[variant->buffer_index].base_ptr)); 1174 1175 struct x86_reg buf_max_index = 1176 x86_make_disp(p->machine_EDI, 1177 get_offset(p, &p->buffer[variant->buffer_index].max_index)); 1178 1179 1180 1181 /* Calculate pointer to current attrib: 1182 */ 1183 switch(index_size) 1184 { 1185 case 1: 1186 x86_movzx8(p->func, ptr, elt); 1187 break; 1188 case 2: 1189 x86_movzx16(p->func, ptr, elt); 1190 break; 1191 case 4: 1192 x86_mov(p->func, ptr, elt); 1193 break; 1194 } 1195 1196 /* Clamp to max_index 1197 */ 1198 x86_cmp(p->func, ptr, buf_max_index); 1199 x86_cmovcc(p->func, ptr, buf_max_index, cc_AE); 1200 1201 x86_imul(p->func, ptr, buf_stride); 1202 
x64_rexw(p->func); 1203 x86_add(p->func, ptr, buf_base_ptr); 1204 return ptr; 1205 } 1206} 1207 1208 1209 1210static boolean incr_inputs( struct translate_sse *p, 1211 unsigned index_size ) 1212{ 1213 if (!index_size && p->nr_buffer_variants == 1) { 1214 struct x86_reg stride = x86_make_disp(p->machine_EDI, 1215 get_offset(p, &p->buffer[0].stride)); 1216 1217 if (p->buffer_variant[0].instance_divisor == 0) { 1218 x64_rexw(p->func); 1219 x86_add(p->func, p->idx_ESI, stride); 1220 sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); 1221 } 1222 } 1223 else if (!index_size) { 1224 unsigned i; 1225 1226 /* Is this worthwhile?? 1227 */ 1228 for (i = 0; i < p->nr_buffer_variants; i++) { 1229 struct translate_buffer_variant *variant = &p->buffer_variant[i]; 1230 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, 1231 get_offset(p, &variant->ptr)); 1232 struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, 1233 get_offset(p, &p->buffer[variant->buffer_index].stride)); 1234 1235 if (variant->instance_divisor == 0) { 1236 x86_mov(p->func, p->tmp_EAX, buf_stride); 1237 x64_rexw(p->func); 1238 x86_add(p->func, p->tmp_EAX, buf_ptr); 1239 if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); 1240 x64_rexw(p->func); 1241 x86_mov(p->func, buf_ptr, p->tmp_EAX); 1242 } 1243 } 1244 } 1245 else { 1246 x64_rexw(p->func); 1247 x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); 1248 } 1249 1250 return TRUE; 1251} 1252 1253 1254/* Build run( struct translate *machine, 1255 * unsigned start, 1256 * unsigned count, 1257 * void *output_buffer ) 1258 * or 1259 * run_elts( struct translate *machine, 1260 * unsigned *elts, 1261 * unsigned count, 1262 * void *output_buffer ) 1263 * 1264 * Lots of hardcoding 1265 * 1266 * EAX -- pointer to current output vertex 1267 * ECX -- pointer to current attribute 1268 * 1269 */ 1270static boolean build_vertex_emit( struct translate_sse *p, 1271 struct x86_function *func, 1272 unsigned index_size ) 1273{ 1274 
int fixup, label; 1275 unsigned j; 1276 1277 memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); 1278 memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); 1279 1280 p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); 1281 p->idx_ESI = x86_make_reg(file_REG32, reg_SI); 1282 p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); 1283 p->machine_EDI = x86_make_reg(file_REG32, reg_DI); 1284 p->count_EBP = x86_make_reg(file_REG32, reg_BP); 1285 p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); 1286 p->src_ECX = x86_make_reg(file_REG32, reg_CX); 1287 1288 p->func = func; 1289 1290 x86_init_func(p->func); 1291 1292 if(x86_target(p->func) == X86_64_WIN64_ABI) 1293 { 1294 /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ 1295 sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); 1296 sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); 1297 } 1298 1299 x86_push(p->func, p->outbuf_EBX); 1300 x86_push(p->func, p->count_EBP); 1301 1302/* on non-Win64 x86-64, these are already in the right registers */ 1303 if(x86_target(p->func) != X86_64_STD_ABI) 1304 { 1305 x86_push(p->func, p->machine_EDI); 1306 x86_push(p->func, p->idx_ESI); 1307 1308 x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); 1309 x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); 1310 } 1311 1312 x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); 1313 1314 if(x86_target(p->func) != X86_32) 1315 x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); 1316 else 1317 x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); 1318 1319 /* Load instance ID. 
1320 */ 1321 if (p->use_instancing) { 1322 x86_mov(p->func, 1323 p->tmp_EAX, 1324 x86_fn_arg(p->func, 4)); 1325 x86_mov(p->func, 1326 x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), 1327 p->tmp_EAX); 1328 } 1329 1330 /* Get vertex count, compare to zero 1331 */ 1332 x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); 1333 x86_cmp(p->func, p->count_EBP, p->tmp_EAX); 1334 fixup = x86_jcc_forward(p->func, cc_E); 1335 1336 /* always load, needed or not: 1337 */ 1338 init_inputs(p, index_size); 1339 1340 /* Note address for loop jump 1341 */ 1342 label = x86_get_label(p->func); 1343 { 1344 struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); 1345 int last_variant = -1; 1346 struct x86_reg vb; 1347 1348 for (j = 0; j < p->translate.key.nr_elements; j++) { 1349 const struct translate_element *a = &p->translate.key.element[j]; 1350 unsigned variant = p->element_to_buffer_variant[j]; 1351 1352 /* Figure out source pointer address: 1353 */ 1354 if (variant != last_variant) { 1355 last_variant = variant; 1356 vb = get_buffer_ptr(p, index_size, variant, elt); 1357 } 1358 1359 if (!translate_attr( p, a, 1360 x86_make_disp(vb, a->input_offset), 1361 x86_make_disp(p->outbuf_EBX, a->output_offset))) 1362 return FALSE; 1363 } 1364 1365 /* Next output vertex: 1366 */ 1367 x64_rexw(p->func); 1368 x86_lea(p->func, 1369 p->outbuf_EBX, 1370 x86_make_disp(p->outbuf_EBX, 1371 p->translate.key.output_stride)); 1372 1373 /* Incr index 1374 */ 1375 incr_inputs( p, index_size ); 1376 } 1377 1378 /* decr count, loop if not zero 1379 */ 1380 x86_dec(p->func, p->count_EBP); 1381 x86_jcc(p->func, cc_NZ, label); 1382 1383 /* Exit mmx state? 
1384 */ 1385 if (p->func->need_emms) 1386 mmx_emms(p->func); 1387 1388 /* Land forward jump here: 1389 */ 1390 x86_fixup_fwd_jump(p->func, fixup); 1391 1392 /* Pop regs and return 1393 */ 1394 1395 if(x86_target(p->func) != X86_64_STD_ABI) 1396 { 1397 x86_pop(p->func, p->idx_ESI); 1398 x86_pop(p->func, p->machine_EDI); 1399 } 1400 1401 x86_pop(p->func, p->count_EBP); 1402 x86_pop(p->func, p->outbuf_EBX); 1403 1404 if(x86_target(p->func) == X86_64_WIN64_ABI) 1405 { 1406 sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); 1407 sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); 1408 } 1409 x86_ret(p->func); 1410 1411 return TRUE; 1412} 1413 1414 1415 1416 1417 1418 1419 1420static void translate_sse_set_buffer( struct translate *translate, 1421 unsigned buf, 1422 const void *ptr, 1423 unsigned stride, 1424 unsigned max_index ) 1425{ 1426 struct translate_sse *p = (struct translate_sse *)translate; 1427 1428 if (buf < p->nr_buffers) { 1429 p->buffer[buf].base_ptr = (char *)ptr; 1430 p->buffer[buf].stride = stride; 1431 p->buffer[buf].max_index = max_index; 1432 } 1433 1434 if (0) debug_printf("%s %d/%d: %p %d\n", 1435 __FUNCTION__, buf, 1436 p->nr_buffers, 1437 ptr, stride); 1438} 1439 1440 1441static void translate_sse_release( struct translate *translate ) 1442{ 1443 struct translate_sse *p = (struct translate_sse *)translate; 1444 1445 x86_release_func( &p->linear_func ); 1446 x86_release_func( &p->elt_func ); 1447 1448 os_free_aligned(p); 1449} 1450 1451 1452struct translate *translate_sse2_create( const struct translate_key *key ) 1453{ 1454 struct translate_sse *p = NULL; 1455 unsigned i; 1456 1457 /* this is misnamed, it actually refers to whether rtasm is enabled or not */ 1458 if (!rtasm_cpu_has_sse()) 1459 goto fail; 1460 1461 p = os_malloc_aligned(sizeof(struct translate_sse), 16); 1462 if (p == NULL) 1463 goto fail; 1464 memset(p, 0, sizeof(*p)); 1465 
memcpy(p->consts, consts, sizeof(consts)); 1466 1467 p->translate.key = *key; 1468 p->translate.release = translate_sse_release; 1469 p->translate.set_buffer = translate_sse_set_buffer; 1470 1471 for (i = 0; i < key->nr_elements; i++) { 1472 if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { 1473 unsigned j; 1474 1475 p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1); 1476 1477 if (key->element[i].instance_divisor) { 1478 p->use_instancing = TRUE; 1479 } 1480 1481 /* 1482 * Map vertex element to vertex buffer variant. 1483 */ 1484 for (j = 0; j < p->nr_buffer_variants; j++) { 1485 if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer && 1486 p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) { 1487 break; 1488 } 1489 } 1490 if (j == p->nr_buffer_variants) { 1491 p->buffer_variant[j].buffer_index = key->element[i].input_buffer; 1492 p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor; 1493 p->nr_buffer_variants++; 1494 } 1495 p->element_to_buffer_variant[i] = j; 1496 } else { 1497 assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID); 1498 1499 p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID; 1500 } 1501 } 1502 1503 if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); 1504 1505 if (!build_vertex_emit(p, &p->linear_func, 0)) 1506 goto fail; 1507 1508 if (!build_vertex_emit(p, &p->elt_func, 4)) 1509 goto fail; 1510 1511 if (!build_vertex_emit(p, &p->elt16_func, 2)) 1512 goto fail; 1513 1514 if (!build_vertex_emit(p, &p->elt8_func, 1)) 1515 goto fail; 1516 1517 p->translate.run = (run_func) x86_get_func(&p->linear_func); 1518 if (p->translate.run == NULL) 1519 goto fail; 1520 1521 p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func); 1522 if (p->translate.run_elts == NULL) 1523 goto fail; 1524 1525 p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func); 1526 if (p->translate.run_elts16 == NULL) 1527 goto fail; 1528 1529 
p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func); 1530 if (p->translate.run_elts8 == NULL) 1531 goto fail; 1532 1533 return &p->translate; 1534 1535 fail: 1536 if (p) 1537 translate_sse_release( &p->translate ); 1538 1539 return NULL; 1540} 1541 1542 1543 1544#else 1545 1546struct translate *translate_sse2_create( const struct translate_key *key ) 1547{ 1548 return NULL; 1549} 1550 1551#endif 1552