"""Mul primitive used by the GEMM function.

The Mul primitive takes 1-3 zipped rows and 1-3 zipped columns and performs
matrix multiplication on those resulting in a small 1x1 to 3x3 block of
results.

GenerateMulNx8Mx8 also accepts lane counts of 4 on either side; see the note
in that function about blocks of nine or more results.
"""

import neon_emitter


class Error(Exception):
    """Module level error."""


class ConfigurationError(Error):
    """Unsupported configuration."""


class MulLanes(object):
    """A list of lane registers together with the address they load from."""

    def __init__(self, input_address):
        # Register holding the address the lanes are loaded from.
        self.input_address = input_address
        # Lane registers, in the order they were added.
        self.lanes = []

    def AddLane(self, lane):
        """Append one lane register."""
        self.lanes.append(lane)

    def FreeRegisters(self, registers):
        """Return every lane register to `registers`.

        Each slot is overwritten with None in place, so len(self.lanes) is
        unchanged; GenerateAggregatorReduceStore relies on that length after
        the lanes have been freed.
        """
        for index, lane in enumerate(self.lanes):
            registers.FreeRegister(lane)
            self.lanes[index] = None


def GenerateMulLanes(registers, lane_count, address):
    """Build a MulLanes with `lane_count` freshly allocated double registers."""
    lanes = MulLanes(address)
    for _ in range(lane_count):
        lanes.AddLane(registers.DoubleRegister())
    return lanes


def Generate3MulLanes(quad_register, registers, address):
    """Build three lanes: both halves of `quad_register` plus a new double.

    Used by the 3x3 path, which also writes to `quad_register` as a whole
    (see Generate3x3LoadMultiplyAggregate).
    """
    lanes = MulLanes(address)
    lanes.AddLane(registers.Low(quad_register))
    lanes.AddLane(registers.High(quad_register))
    lanes.AddLane(registers.DoubleRegister())
    return lanes


def GenerateAndClearAggregators(emitter, registers, aggregator_count):
    """Prepare aggregators and emit aggregator clear code.

    Allocates `aggregator_count` quad registers. The first three are set from
    an immediate zero; each later one is copied from the aggregator three
    positions before it.

    Returns:
      The list of allocated aggregator registers, in allocation order.
    """
    emitter.EmitComment('Clear aggregators.')
    aggregators = []
    for i in range(aggregator_count):
        aggregator = registers.QuadRegister()
        aggregators.append(aggregator)
        if i < 3:
            emitter.EmitVMov('i32', aggregator, emitter.ImmediateConstant(0))
        else:
            emitter.EmitVMov('i32', aggregator, aggregators[i - 3])
    emitter.EmitNewline()
    return aggregators


def GenerateNxMLoadMultiplyAggregate(emitter, registers, left_lanes,
                                     right_lanes, aggregators, count):
    """Emit inner loop for N rows x M cols multiplication.

    One temporary quad register is allocated per (row, col) pair and freed
    again before returning. The product for (row, col) is accumulated into
    aggregators[row * cols + col].
    """
    emitter.EmitComment('General NxM lanes loop.')
    emitter.EmitNumericalLabel(1)
    emitter.EmitNewline()
    emitter.EmitComment('Subtract counter.')
    emitter.EmitSubs(count, count, emitter.ImmediateConstant(8))
    emitter.EmitNewline()

    # NOTE(review): the 64 passed to DereferenceIncrement is presumably an
    # alignment hint -- confirm against neon_emitter.
    emitter.EmitVLoadA(
        '1.8', left_lanes.lanes,
        emitter.DereferenceIncrement(left_lanes.input_address, 64))
    emitter.EmitVLoadA(
        '1.8', right_lanes.lanes,
        emitter.DereferenceIncrement(right_lanes.input_address, 64))

    emitter.EmitPldOffset(left_lanes.input_address,
                          emitter.ImmediateConstant(64))
    emitter.EmitPldOffset(right_lanes.input_address,
                          emitter.ImmediateConstant(64))

    rows = len(left_lanes.lanes)
    cols = len(right_lanes.lanes)

    multiply_results = [registers.QuadRegister() for _ in range(rows * cols)]

    for row in range(rows):
        for col in range(cols):
            index = row * cols + col
            emitter.EmitVMull('u8', multiply_results[index],
                              right_lanes.lanes[col], left_lanes.lanes[row])

    for aggregator, product in zip(aggregators[:rows * cols],
                                   multiply_results):
        emitter.EmitVPadal('u16', aggregator, product)

    emitter.EmitNewline()
    emitter.EmitComment('Loop break.')
    emitter.EmitBneBack(1)
    emitter.EmitNewline()

    for register in multiply_results:
        registers.FreeRegister(register)


def Generate3x3LoadMultiplyAggregate(emitter, registers, left_lanes,
                                     right_lanes, aggregators, count,
                                     backup_register):
    """Emit inner loop for 3 rows x 3 cols multiplication (register trick).

    Only four temporary quad registers are allocated. The nine products are
    computed in two batches (four, then five); the ninth product is written to
    `backup_register`, which Generate3MulLanes also uses for the first two
    left lanes.
    """
    emitter.EmitComment('3x3 lanes loop.')
    emitter.EmitNumericalLabel(1)
    emitter.EmitNewline()
    emitter.EmitComment('Subtract counter.')
    emitter.EmitSubs(count, count, emitter.ImmediateConstant(8))
    emitter.EmitNewline()

    emitter.EmitVLoadA(
        '1.8', left_lanes.lanes,
        emitter.DereferenceIncrement(left_lanes.input_address, 64))
    emitter.EmitVLoadA(
        '1.8', right_lanes.lanes,
        emitter.DereferenceIncrement(right_lanes.input_address, 64))

    emitter.EmitPldOffset(left_lanes.input_address,
                          emitter.ImmediateConstant(64))
    emitter.EmitPldOffset(right_lanes.input_address,
                          emitter.ImmediateConstant(64))

    temp = [registers.QuadRegister() for _ in range(4)]
    left = left_lanes.lanes
    right = right_lanes.lanes

    # First batch: products (0,0), (0,1), (0,2), (1,0).
    emitter.EmitVMull('u8', temp[0], left[0], right[0])
    emitter.EmitVMull('u8', temp[1], left[0], right[1])
    emitter.EmitVMull('u8', temp[2], left[0], right[2])
    emitter.EmitVMull('u8', temp[3], left[1], right[0])

    emitter.EmitVPadal('u16', aggregators[0], temp[0])
    emitter.EmitVPadal('u16', aggregators[1], temp[1])
    emitter.EmitVPadal('u16', aggregators[2], temp[2])
    emitter.EmitVPadal('u16', aggregators[3], temp[3])

    # Second batch: products (1,1), (1,2), (2,0), (2,1) reuse the temporaries;
    # (2,2) is emitted last and targets the backup register.
    emitter.EmitVMull('u8', temp[0], left[1], right[1])
    emitter.EmitVMull('u8', temp[1], left[1], right[2])
    emitter.EmitVMull('u8', temp[2], left[2], right[0])
    emitter.EmitVMull('u8', temp[3], left[2], right[1])
    emitter.EmitVMull('u8', backup_register, left[2], right[2])

    emitter.EmitVPadal('u16', aggregators[4], temp[0])
    emitter.EmitVPadal('u16', aggregators[5], temp[1])
    emitter.EmitVPadal('u16', aggregators[6], temp[2])
    emitter.EmitVPadal('u16', aggregators[7], temp[3])
    emitter.EmitVPadal('u16', aggregators[8], backup_register)

    emitter.EmitNewline()
    emitter.EmitComment('Loop break.')
    emitter.EmitBneBack(1)
    emitter.EmitNewline()

    for register in temp:
        registers.FreeRegister(register)


def ReadParams(emitter, registers, input_address, elements, min_reg):
    """Emit a '1.32' load of `elements` values from `input_address`.

    Args:
      emitter: code emitter.
      registers: register allocator.
      input_address: register holding the address to load from.
      elements: number of values; 1-2 use a double register, 3-4 a quad.
      min_reg: passed to the allocator (doubled for a double register).
        NOTE(review): presumably the lowest quad index to allocate from --
        confirm against neon_emitter.

    Returns:
      The register that receives the loaded values.

    Raises:
      ConfigurationError: if `elements` is not 1, 2, 3 or 4.
    """
    if elements == 1 or elements == 2:
        register = registers.DoubleRegister(min_reg * 2)
    elif elements == 3 or elements == 4:
        register = registers.QuadRegister(min_reg)
    else:
        raise ConfigurationError('Unsupported elements no: %d' % elements)
    emitter.EmitVLoad('1.32', register,
                      emitter.Dereference(input_address, 64))
    return register


def Duplicate(emitter, registers, rows, cols, min_register, values):
    """Populate a grid of registers duplicating provided values.

    Allocates one register per row (double for 1-2 cols, quad for 3-4 cols)
    and, for 1-4 rows, emits one VDup per row from successive 32-bit lanes of
    `values`. With 1-2 rows `values` is indexed directly; with 3-4 rows it is
    addressed through its low and high halves. For any other row count the
    registers are allocated but no VDup is emitted.

    Returns:
      The list of `rows` allocated registers.

    Raises:
      ConfigurationError: if `cols` is not 1, 2, 3 or 4.
    """
    if cols == 1 or cols == 2:
        allocate = registers.DoubleRegister
    elif cols == 3 or cols == 4:
        allocate = registers.QuadRegister
    else:
        raise ConfigurationError('Unsupported duplicate amount: %d' % cols)
    duplicated = [allocate(min_register) for _ in range(rows)]

    if rows == 1 or rows == 2:
        for i in range(rows):
            emitter.EmitVDup('32', duplicated[i], emitter.Lane(values, i))
    elif rows == 3 or rows == 4:
        for i in range(rows):
            # Lanes 0-1 live in the low half, lanes 2-3 in the high half.
            if i < 2:
                half = registers.Low(values)
            else:
                half = registers.High(values)
            emitter.EmitVDup('32', duplicated[i], emitter.Lane(half, i % 2))

    return duplicated


def DuplicateGeneralRegister(emitter, registers, cols, general_register,
                             min_register):
    """Emit a VDup of `general_register` into a new vector register.

    Returns:
      The allocated register: double for 1-2 cols, quad for 3-4 cols.

    Raises:
      ConfigurationError: if `cols` is not 1, 2, 3 or 4.
    """
    if cols == 1 or cols == 2:
        duplicated = registers.DoubleRegister(min_register)
    elif cols == 3 or cols == 4:
        duplicated = registers.QuadRegister(min_register)
    else:
        raise ConfigurationError('Unsupported duplicate amount: %d' % cols)

    emitter.EmitVDup('32', duplicated, general_register)
    return duplicated


def ReduceAggregator(emitter, registers, aggregators, row, cols):
    """Emit pairwise adds that fold one row of aggregators into one register.

    The aggregators of a row occupy indices row * cols .. row * cols + cols - 1
    (the layout used by GenerateNxMLoadMultiplyAggregate). The caller is
    expected to have already folded each aggregator into its low half.

    Returns:
      The register holding the row: a double register for 1-2 cols, the
      row's first quad aggregator for 3-4 cols.

    Raises:
      ConfigurationError: if `cols` is not 1, 2, 3 or 4.
    """
    if cols not in (1, 2, 3, 4):
        raise ConfigurationError('Unsupported columns no: %d' % cols)

    # The 4-column case previously used row * 3, which only matched the
    # layout for row 0.
    base = row * cols

    if cols == 1:
        register = registers.Low(aggregators[base])
        emitter.EmitVPadd('u32', register, register, register)
        return register

    if cols == 2:
        register = registers.Low(aggregators[base])
        emitter.EmitVPadd('u32', register, register,
                          registers.Low(aggregators[base + 1]))
        return register

    # 3 or 4 columns: columns 0-1 go to the low half, the rest to the high
    # half. With 3 columns the last aggregator is paired with itself.
    register = aggregators[base]
    emitter.EmitVPadd('u32', registers.Low(register), registers.Low(register),
                      registers.Low(aggregators[base + 1]))
    emitter.EmitVPadd('u32', registers.High(register),
                      registers.Low(aggregators[base + 2]),
                      registers.Low(aggregators[base + cols - 1]))
    return register


def StoreAggregator(emitter, registers, aggregator, cols, result_address,
                    result_stride):
    """Emit the store of one reduced row of `cols` 32-bit results.

    For 3 columns the row is written with two stores (low half with address
    increment, then lane 0 of the high half with the stride offset); the
    caller adjusts the stride for that case.

    Raises:
      ConfigurationError: if `cols` is not 1, 2, 3 or 4.
    """
    if cols == 1:
        emitter.EmitVStoreOffset('1.32', emitter.Lane(aggregator, 0),
                                 emitter.Dereference(result_address, None),
                                 result_stride)
    elif cols == 2:
        emitter.EmitVStoreOffset('1.32', aggregator,
                                 emitter.Dereference(result_address, None),
                                 result_stride)
    elif cols == 3:
        emitter.EmitVStore(
            '1.32', registers.Low(aggregator),
            emitter.DereferenceIncrement(result_address, None))
        emitter.EmitVStoreOffset(
            '1.32', emitter.Lane(registers.High(aggregator), 0),
            emitter.Dereference(result_address, None), result_stride)
        emitter.EmitNewline()
    elif cols == 4:
        emitter.EmitVStoreOffsetA(
            '1.32', [registers.Low(aggregator), registers.High(aggregator)],
            emitter.Dereference(result_address, None), result_stride)
    else:
        raise ConfigurationError('Unsupported columns no: %d' % cols)


def GenerateAggregatorReduceStore(emitter, registers, aggregators, result_type,
                                  lhs_add, rhs_add, left_lanes, right_lanes,
                                  results, results_stride):
    """Emit code that reduces 4 lane aggregators to 1 value, and stores them.

    Steps, in emission order: optional loads of the lhs/rhs offsets, optional
    duplication of the result scale (float results only), stride adjustment
    for 3 columns, horizontal reduce of every aggregator, per-row reduce,
    optional offset adds, optional conversion to float with scaling, store.

    Row and column counts are taken from the lengths of the lane lists, so the
    lists must still have their original length (their entries may already be
    freed).
    """
    rows = len(left_lanes.lanes)
    cols = len(right_lanes.lanes)
    # Compare by value: identity comparison against a literal only works when
    # the caller's string happens to be interned.
    is_float = result_type == 'float'

    if lhs_add:
        left_offset = ReadParams(emitter, registers, left_lanes.input_address,
                                 rows, 4)
        left_offsets = Duplicate(emitter, registers, rows, cols, 4,
                                 left_offset)
    else:
        left_offsets = None

    if rhs_add:
        right_offset = ReadParams(emitter, registers,
                                  right_lanes.input_address, cols, 4)
    else:
        right_offset = None

    if is_float:
        result_scale = DuplicateGeneralRegister(
            emitter, registers, cols, registers.MapParameter('result_scale'),
            4)
    else:
        result_scale = None

    if cols == 3:
        emitter.EmitNewline()
        emitter.EmitComment('Change stride because storing in two ops.')
        emitter.EmitSub(results_stride, results_stride,
                        emitter.ImmediateConstant(8))

    emitter.EmitNewline()
    emitter.EmitComment('Horizontal reduce aggregators.')
    for aggregator in aggregators:
        emitter.EmitVPadd('u32', registers.Low(aggregator),
                          registers.Low(aggregator),
                          registers.High(aggregator))

    emitter.EmitNewline()
    emitter.EmitComment('Reduce rows.')
    row_temps = [
        ReduceAggregator(emitter, registers, aggregators, row, cols)
        for row in range(rows)
    ]

    if lhs_add:
        emitter.EmitNewline()
        emitter.EmitComment('Add lhs offsets to aggregated rows.')
        for row_temp, row_offset in zip(row_temps, left_offsets):
            emitter.EmitVAdd('s32', row_temp, row_temp, row_offset)

    if rhs_add:
        emitter.EmitNewline()
        emitter.EmitComment('Add rhs offset to aggregated rows.')
        for row_temp in row_temps:
            emitter.EmitVAdd('s32', row_temp, row_temp, right_offset)

    if is_float:
        emitter.EmitNewline()
        emitter.EmitComment('Convert to float. Multiply by result scale.')
        for row_temp in row_temps:
            emitter.EmitVCvt('f32', 's32', row_temp, row_temp)
        for row_temp in row_temps:
            emitter.EmitVMul('f32', row_temp, row_temp, result_scale)

    emitter.EmitNewline()
    emitter.EmitComment('Store reduced rows.')
    for row_temp in row_temps:
        StoreAggregator(emitter, registers, row_temp, cols, results,
                        results_stride)


def BuildName(result_type, lhs_add, rhs_add, left, right):
    """Return the generated function name for the given configuration."""
    name = 'mul_%dx8_%dx8_%s' % (left, right, result_type)
    if lhs_add:
        name += '_lhsadd'
    if rhs_add:
        name += '_rhsadd'
    return name


def CppResultType(result_type):
    """Return the C++ pointer type used for the result buffer.

    Raises:
      ConfigurationError: if `result_type` is neither 'int32' nor 'float'.
    """
    if result_type == 'int32':
        return 'std::int32_t*'
    elif result_type == 'float':
        return 'float*'
    else:
        raise ConfigurationError('Unsupported result type: %s' % result_type)


def GetParameters(result_type):
    """Return the [type, name] parameter list of the generated function.

    A trailing 'result_scale' float parameter is added for float results.

    Raises:
      ConfigurationError: if `result_type` is neither 'int32' nor 'float'.
    """
    params = [['const std::uint8_t*', 'lhs'], ['const std::uint8_t*', 'rhs'],
              ['std::int32_t', 'count'],
              [CppResultType(result_type), 'result'],
              ['std::int32_t', 'result_stride']]
    if result_type == 'float':
        params.append(['float', 'result_scale'])
    return params


def GenerateMulNx8Mx8(emitter, result_type, lhs_add, rhs_add, left_lanes_count,
                      right_lanes_count):
    """Emit the multiply code for given rows and cols counts.

    Args:
      emitter: code emitter the function is written to.
      result_type: 'int32' or 'float'.
      lhs_add: whether lhs offsets are read and added to the results.
      rhs_add: whether rhs offsets are read and added to the results.
      left_lanes_count: number of lhs lanes (rows), 1 to 4.
      right_lanes_count: number of rhs lanes (cols), 1 to 4.

    Raises:
      ConfigurationError: if a lane count is outside 1..4 or the result type
        is unsupported.
    """
    if left_lanes_count < 1 or left_lanes_count > 4:
        raise ConfigurationError('Left_lanes should be: 1, 2, 3 or 4.')
    if right_lanes_count < 1 or right_lanes_count > 4:
        raise ConfigurationError('Right_lanes should be: 1, 2, 3 or 4.')

    emitter.EmitFunctionBeginA(
        BuildName(result_type, lhs_add, rhs_add, left_lanes_count,
                  right_lanes_count), GetParameters(result_type),
        'inline void')

    emitter.EmitAssert('count % 8 == 0')
    emitter.EmitAssert('count >= 8')
    emitter.EmitAsmBegin()

    registers = neon_emitter.NeonRegisters()

    count = registers.MapParameter('count')

    size = left_lanes_count * right_lanes_count

    lhs = registers.MapParameter('lhs')
    rhs = registers.MapParameter('rhs')

    emitter.EmitPld(lhs)
    emitter.EmitPld(rhs)

    aggregators = GenerateAndClearAggregators(emitter, registers, size)

    if size < 9:
        left_lanes = GenerateMulLanes(registers, left_lanes_count, lhs)
        right_lanes = GenerateMulLanes(registers, right_lanes_count, rhs)

        GenerateNxMLoadMultiplyAggregate(emitter, registers, left_lanes,
                                         right_lanes, aggregators, count)

    else:  # left == 3 and right == 3
        # NOTE(review): the validation above also admits 3x4, 4x3 and 4x4,
        # which reach this branch, yet it always builds three left lanes and
        # the 3x3 kernel. Those combinations look unsupported -- confirm with
        # callers before relying on them.
        backup_register = registers.QuadRegister()
        left_lanes = Generate3MulLanes(backup_register, registers, lhs)
        right_lanes = GenerateMulLanes(registers, right_lanes_count, rhs)

        Generate3x3LoadMultiplyAggregate(emitter, registers, left_lanes,
                                         right_lanes, aggregators, count,
                                         backup_register)

    # Lane registers are no longer needed once the inner loop is emitted; the
    # lane lists keep their length for the reduce/store stage.
    left_lanes.FreeRegisters(registers)
    right_lanes.FreeRegisters(registers)

    GenerateAggregatorReduceStore(emitter, registers, aggregators, result_type,
                                  lhs_add, rhs_add, left_lanes, right_lanes,
                                  registers.MapParameter('result'),
                                  registers.MapParameter('result_stride'))

    emitter.EmitAsmEnd(registers.MappedParameters(), [],
                       registers.Clobbers() + ['cc', 'memory'])
    emitter.EmitFunctionEnd()
def GenerateFunctions(emitter, result_type, lhs_add, rhs_add):
  """Emit every multiply kernel variant for one configuration.

  Kernels are emitted for all left x right lane counts in 1..3 x 1..3
  (left-major order), followed by the single 1x4 kernel. A newline is
  emitted after each kernel.

  Args:
    emitter: emitter object that receives the generated code.
    result_type: result type tag forwarded to GenerateMulNx8Mx8.
    lhs_add: forwarded to GenerateMulNx8Mx8.
    rhs_add: forwarded to GenerateMulNx8Mx8.
  """
  lane_shapes = [(rows, cols) for rows in range(1, 4) for cols in range(1, 4)]
  lane_shapes.append((1, 4))
  for rows, cols in lane_shapes:
    GenerateMulNx8Mx8(emitter, result_type, lhs_add, rhs_add, rows, cols)
    emitter.EmitNewline()


if __name__ == '__main__':
  # Run as a script: emit the int32 kernels with both add flags enabled.
  GenerateFunctions(neon_emitter.NeonEmitter(), 'int32', True, True)