"""Mul primitive used by the GEMM function.

The Mul primitive takes 1-4 zipped rows and 1-4 zipped columns and performs
matrix multiplication on those resulting in a small block of results (1x1 up
to 4x4).
"""
67b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
77b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangimport neon_emitter
87b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
97b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
class Error(Exception):
  """Base class for the exceptions raised by this module."""
127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
class ConfigurationError(Error):
  """Raised when a requested configuration is not supported."""
167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
class MulLanes(object):
  """An input address together with the registers (lanes) loaded from it."""

  def __init__(self, input_address):
    # Lanes are attached afterwards, one AddLane call per register.
    self.lanes = []
    self.input_address = input_address

  def AddLane(self, lane):
    """Append `lane` to the end of the lane list."""
    self.lanes += [lane]

  def FreeRegisters(self, registers):
    """Hand every lane to registers.FreeRegister and blank its slot."""
    for position, lane in enumerate(self.lanes):
      registers.FreeRegister(lane)
      # The slot is kept (list length is unchanged) but no longer names a
      # register.
      self.lanes[position] = None
317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def GenerateMulLanes(registers, lane_count, address):
  """Return a MulLanes for `address` holding `lane_count` double registers.

  Each lane is obtained from a separate registers.DoubleRegister() call.
  """
  result = MulLanes(address)
  for unused_lane in range(lane_count):
    double_register = registers.DoubleRegister()
    result.AddLane(double_register)
  return result
387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def Generate3MulLanes(quad_register, registers, address):
  """Return a three lane MulLanes that reuses an existing quad register.

  The first two lanes are the low and high halves of `quad_register`; only the
  third lane is a newly requested double register.
  """
  result = MulLanes(address)
  low_half = registers.Low(quad_register)
  result.AddLane(low_half)
  high_half = registers.High(quad_register)
  result.AddLane(high_half)
  extra_lane = registers.DoubleRegister()
  result.AddLane(extra_lane)
  return result
467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def GenerateAndClearAggregators(emitter, registers, aggregator_count):
  """Allocate the aggregator registers and emit the code that clears them.

  The first three aggregators are cleared from an immediate zero; every later
  one is copied from the aggregator allocated three positions earlier.

  Returns:
    The list of allocated quad registers, in allocation order.
  """
  emitter.EmitComment('Clear aggregators.')
  allocated = []
  for position in range(aggregator_count):
    current = registers.QuadRegister()
    allocated.append(current)
    if position >= 3:
      source = allocated[position - 3]
    else:
      source = emitter.ImmediateConstant(0)
    emitter.EmitVMov('i32', current, source)
  emitter.EmitNewline()
  return allocated
617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def GenerateNxMLoadMultiplyAggregate(emitter, registers, left_lanes,
                                     right_lanes, aggregators, count):
  """Emit the inner loop multiplying N left lanes by M right lanes.

  One temporary quad register is requested per (row, col) pair; the product of
  that pair is accumulated into aggregators[row * cols + col]. The temporaries
  are released before returning.
  """
  emitter.EmitComment('General NxM lanes loop.')
  emitter.EmitNumericalLabel(1)
  emitter.EmitNewline()
  emitter.EmitComment('Subtract counter.')
  step = emitter.ImmediateConstant(8)
  emitter.EmitSubs(count, count, step)
  emitter.EmitNewline()

  # Load both operands, left first, advancing each input address.
  for side in (left_lanes, right_lanes):
    source = emitter.DereferenceIncrement(side.input_address, 64)
    emitter.EmitVLoadA('1.8', side.lanes, source)

  # Preload hints for both inputs, left first.
  for side in (left_lanes, right_lanes):
    ahead = emitter.ImmediateConstant(64)
    emitter.EmitPldOffset(side.input_address, ahead)

  pair_count = len(left_lanes.lanes) * len(right_lanes.lanes)
  products = [registers.QuadRegister() for unused_i in range(pair_count)]

  # Row-major walk over the pairs; `slot` equals row * cols + col.
  slot = 0
  for left in left_lanes.lanes:
    for right in right_lanes.lanes:
      emitter.EmitVMull('u8', products[slot], right, left)
      slot += 1

  for slot, product in enumerate(products):
    emitter.EmitVPadal('u16', aggregators[slot], product)

  emitter.EmitNewline()
  emitter.EmitComment('Loop break.')
  emitter.EmitBneBack(1)
  emitter.EmitNewline()

  for product in products:
    registers.FreeRegister(product)
1067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def Generate3x3LoadMultiplyAggregate(emitter, registers, left_lanes,
                                     right_lanes, aggregators, count,
                                     backup_register):
  """Emit the inner loop for the 3 rows x 3 cols case (register trick).

  Only four temporary quad registers are requested. The nine products are
  emitted in two passes that reuse those temporaries, with `backup_register`
  holding the ninth product (row 2, col 2).
  """
  emitter.EmitComment('3x3 lanes loop.')
  emitter.EmitNumericalLabel(1)
  emitter.EmitNewline()
  emitter.EmitComment('Subtract counter.')
  step = emitter.ImmediateConstant(8)
  emitter.EmitSubs(count, count, step)
  emitter.EmitNewline()

  # Load both operands, left first, advancing each input address.
  for side in (left_lanes, right_lanes):
    source = emitter.DereferenceIncrement(side.input_address, 64)
    emitter.EmitVLoadA('1.8', side.lanes, source)

  # Preload hints for both inputs, left first.
  for side in (left_lanes, right_lanes):
    ahead = emitter.ImmediateConstant(64)
    emitter.EmitPldOffset(side.input_address, ahead)

  scratch = [registers.QuadRegister() for unused_i in range(4)]

  # (row, col) pairs in row-major order; pass one feeds aggregators 0-3 and
  # pass two feeds aggregators 4-7.
  first_pass = ((0, 0), (0, 1), (0, 2), (1, 0))
  second_pass = ((1, 1), (1, 2), (2, 0), (2, 1))

  for target, (row, col) in zip(scratch, first_pass):
    emitter.EmitVMull('u8', target, left_lanes.lanes[row],
                      right_lanes.lanes[col])
  for position, target in enumerate(scratch):
    emitter.EmitVPadal('u16', aggregators[position], target)

  for target, (row, col) in zip(scratch, second_pass):
    emitter.EmitVMull('u8', target, left_lanes.lanes[row],
                      right_lanes.lanes[col])
  emitter.EmitVMull('u8', backup_register, left_lanes.lanes[2],
                    right_lanes.lanes[2])
  for position, target in enumerate(scratch):
    emitter.EmitVPadal('u16', aggregators[position + 4], target)
  emitter.EmitVPadal('u16', aggregators[8], backup_register)

  emitter.EmitNewline()
  emitter.EmitComment('Loop break.')
  emitter.EmitBneBack(1)
  emitter.EmitNewline()

  for target in scratch:
    registers.FreeRegister(target)
1637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def ReadParams(emitter, registers, input_address, elements, min_reg):
  """Emit a load of `elements` 32 bit values from `input_address`.

  One or two elements go into a double register (requested with
  `min_reg * 2`), three or four into a quad register (requested with
  `min_reg`).

  Returns:
    The register the emitted load targets.

  Raises:
    ConfigurationError: `elements` is not 1, 2, 3 or 4.
  """
  if elements in (1, 2):
    target = registers.DoubleRegister(min_reg * 2)
  elif elements in (3, 4):
    target = registers.QuadRegister(min_reg)
  else:
    raise ConfigurationError('Unsupported elements no: %d' % elements)

  source = emitter.Dereference(input_address, 64)
  emitter.EmitVLoad('1.32', target, source)
  return target
1767b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1777b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def Duplicate(emitter, registers, rows, cols, min_register, values):
  """Allocate one register per row and emit a lane duplication into each.

  `cols` selects the register width (double for 1-2, quad for 3-4). Row `i`
  is filled from lane `i` of `values`; for 3 or 4 rows `values` is addressed
  through its low and high halves, two lanes each. Any other `rows` value
  allocates the registers but emits no duplication.

  Returns:
    The list of allocated registers, one per row.

  Raises:
    ConfigurationError: `cols` is not 1, 2, 3 or 4.
  """
  if cols in (1, 2):
    allocate = registers.DoubleRegister
  elif cols in (3, 4):
    allocate = registers.QuadRegister
  else:
    raise ConfigurationError('Unsupported duplicate amount: %d' % cols)

  duplicated = [allocate(min_register) for unused_i in range(rows)]

  if rows in (1, 2):
    for index in range(rows):
      lane = emitter.Lane(values, index)
      emitter.EmitVDup('32', duplicated[index], lane)
  elif rows in (3, 4):
    for index in range(rows):
      # Lanes 0 and 1 come from the low half, lanes 2 and 3 from the high half.
      if index < 2:
        half = registers.Low(values)
      else:
        half = registers.High(values)
      lane = emitter.Lane(half, index % 2)
      emitter.EmitVDup('32', duplicated[index], lane)

  return duplicated
2137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def DuplicateGeneralRegister(emitter, registers, cols, general_register,
                             min_register):
  """Emit a duplication of `general_register` into a newly requested register.

  The target is a double register for 1-2 columns and a quad register for 3-4
  columns, requested with `min_register`.

  Returns:
    The register the duplication targets.

  Raises:
    ConfigurationError: `cols` is not 1, 2, 3 or 4.
  """
  if cols in (1, 2):
    target = registers.DoubleRegister(min_register)
  elif cols in (3, 4):
    target = registers.QuadRegister(min_register)
  else:
    raise ConfigurationError('Unsupported duplicate amount: %d' % cols)

  emitter.EmitVDup('32', target, general_register)
  return target
2267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def ReduceAggregator(emitter, registers, aggregators, row, cols):
  """Emit pairwise adds folding one row of aggregators into one register.

  The aggregators of a row occupy `cols` consecutive entries starting at
  `row * cols`. The emitted adds read the low half of each of them and write
  the result into the first aggregator of the row.

  Args:
    emitter: Receives the EmitVPadd calls.
    registers: Provides Low/High to address the halves of a quad register.
    aggregators: Flat list of aggregator registers for all rows.
    row: Index of the row to reduce.
    cols: Number of aggregators per row (1 to 4).

  Returns:
    The register holding the reduced row: the low half of the row's first
    aggregator for 1-2 columns, the whole first aggregator for 3-4 columns.

  Raises:
    ConfigurationError: `cols` is not 1, 2, 3 or 4.
  """
  if cols == 1:
    register = registers.Low(aggregators[row])
    emitter.EmitVPadd('u32', register, register, register)
    return register
  elif cols == 2:
    register = registers.Low(aggregators[row * 2])
    emitter.EmitVPadd('u32', register, register,
                      registers.Low(aggregators[row * 2 + 1]))
    return register
  elif cols == 3:
    register = aggregators[row * 3]
    emitter.EmitVPadd('u32', registers.Low(register), registers.Low(register),
                      registers.Low(aggregators[row * 3 + 1]))
    # The third column has no partner, so it is paired with itself.
    emitter.EmitVPadd('u32', registers.High(register),
                      registers.Low(aggregators[row * 3 + 2]),
                      registers.Low(aggregators[row * 3 + 2]))
    return register
  elif cols == 4:
    # With four columns a row starts at row * 4. The previous stride of 3 made
    # every row after the first read aggregators of the preceding row.
    register = aggregators[row * 4]
    emitter.EmitVPadd('u32', registers.Low(register), registers.Low(register),
                      registers.Low(aggregators[row * 4 + 1]))
    emitter.EmitVPadd('u32', registers.High(register),
                      registers.Low(aggregators[row * 4 + 2]),
                      registers.Low(aggregators[row * 4 + 3]))
    return register
  else:
    raise ConfigurationError('Unsupported columns no: %d' % cols)
2567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def StoreAggregator(emitter, registers, aggregator, cols, result_address,
                    result_stride):
  """Emit the store of one reduced row of `cols` values.

  One, two and four columns are written with a single offset store that is
  given `result_stride`. Three columns take two stores: the low half with an
  incrementing dereference, then lane 0 of the high half with the stride.

  Raises:
    ConfigurationError: `cols` is not 1, 2, 3 or 4.
  """
  if cols not in (1, 2, 3, 4):
    raise ConfigurationError('Unsupported columns no: %d' % cols)

  if cols == 4:
    halves = [registers.Low(aggregator), registers.High(aggregator)]
    destination = emitter.Dereference(result_address, None)
    emitter.EmitVStoreOffsetA('1.32', halves, destination, result_stride)
    return

  if cols == 3:
    low_half = registers.Low(aggregator)
    emitter.EmitVStore('1.32', low_half,
                       emitter.DereferenceIncrement(result_address, None))
    last_value = emitter.Lane(registers.High(aggregator), 0)
    destination = emitter.Dereference(result_address, None)
    emitter.EmitVStoreOffset('1.32', last_value, destination, result_stride)
    emitter.EmitNewline()
    return

  # One column stores a single lane, two columns store the register as is.
  if cols == 1:
    source = emitter.Lane(aggregator, 0)
  else:
    source = aggregator
  destination = emitter.Dereference(result_address, None)
  emitter.EmitVStoreOffset('1.32', source, destination, result_stride)
2817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def GenerateAggregatorReduceStore(emitter, registers, aggregators, result_type,
                                  lhs_add, rhs_add, left_lanes, right_lanes,
                                  results, results_stride):
  """Emit code that reduces 4 lane aggregators to 1 value, and stores them.

  Args:
    emitter: Receives all emitted instructions and comments.
    registers: Register allocator used for offsets, scale and register halves.
    aggregators: Flat list of aggregator registers, `cols` entries per row.
    result_type: When equal to 'float' the rows are converted to float and
      multiplied by the 'result_scale' parameter before being stored.
    lhs_add: If true, per-row offsets are read from left_lanes.input_address
      and added to the reduced rows.
    rhs_add: If true, an offset vector is read from right_lanes.input_address
      and added to every reduced row.
    left_lanes: Lanes of the left operand; their count is the row count.
    right_lanes: Lanes of the right operand; their count is the column count.
    results: Address the reduced rows are stored to.
    results_stride: Stride register passed to the stores. For 3 columns a
      subtraction of 8 from it is emitted, since each row is stored in two ops.
  """
  rows = len(left_lanes.lanes)
  cols = len(right_lanes.lanes)
  # Compare by value. `is` tests object identity, which only holds for equal
  # strings when the interpreter happens to share them.
  is_float = result_type == 'float'

  if lhs_add:
    left_offset = ReadParams(emitter, registers, left_lanes.input_address, rows,
                             4)
    left_offsets = Duplicate(emitter, registers, rows, cols, 4, left_offset)
  else:
    left_offsets = None

  if rhs_add:
    right_offset = ReadParams(emitter, registers, right_lanes.input_address,
                              cols, 4)
  else:
    right_offset = None

  if is_float:
    result_scale = DuplicateGeneralRegister(
        emitter, registers, cols, registers.MapParameter('result_scale'), 4)
  else:
    result_scale = None

  if cols == 3:
    emitter.EmitNewline()
    emitter.EmitComment('Change stride because storing in two ops.')
    emitter.EmitSub(results_stride, results_stride,
                    emitter.ImmediateConstant(8))

  emitter.EmitNewline()
  emitter.EmitComment('Horizontal reduce aggregators.')
  for aggregator in aggregators:
    emitter.EmitVPadd('u32', registers.Low(aggregator),
                      registers.Low(aggregator), registers.High(aggregator))

  emitter.EmitNewline()
  emitter.EmitComment('Reduce rows.')
  row_temps = [
      ReduceAggregator(emitter, registers, aggregators, row, cols)
      for row in range(rows)
  ]

  if lhs_add:
    emitter.EmitNewline()
    emitter.EmitComment('Add lhs offsets to aggregated rows.')
    for (row_temp, row_offset) in zip(row_temps, left_offsets):
      emitter.EmitVAdd('s32', row_temp, row_temp, row_offset)

  if rhs_add:
    emitter.EmitNewline()
    emitter.EmitComment('Add rhs offset to aggregated rows.')
    for row_temp in row_temps:
      emitter.EmitVAdd('s32', row_temp, row_temp, right_offset)

  if is_float:
    emitter.EmitNewline()
    emitter.EmitComment('Convert to float. Multiply by result scale.')
    # All conversions are emitted before the first multiply.
    for row_temp in row_temps:
      emitter.EmitVCvt('f32', 's32', row_temp, row_temp)
    for row_temp in row_temps:
      emitter.EmitVMul('f32', row_temp, row_temp, result_scale)

  emitter.EmitNewline()
  emitter.EmitComment('Store reduced rows.')
  for row_temp in row_temps:
    StoreAggregator(emitter, registers, row_temp, cols, results, results_stride)
3517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def BuildName(result_type, lhs_add, rhs_add, left, right):
  """Return the function name encoding lane counts, result type and offsets.

  The base name is 'mul_<left>x8_<right>x8_<result_type>'; '_lhsadd' and
  '_rhsadd' are appended, in that order, for each enabled offset.
  """
  parts = ['mul_%dx8_%dx8_%s' % (left, right, result_type)]
  if lhs_add:
    parts.append('_lhsadd')
  if rhs_add:
    parts.append('_rhsadd')
  return ''.join(parts)
3607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def CppResultType(result_type):
  """Return the C++ pointer type spelling for `result_type`.

  Args:
    result_type: Either 'int32' or 'float'.

  Returns:
    'std::int32_t*' for 'int32' and 'float*' for 'float'.

  Raises:
    ConfigurationError: `result_type` is neither 'int32' nor 'float'.
  """
  # Compare by value. `is` tests object identity, so an equal string built at
  # runtime could be rejected as unsupported.
  if result_type == 'int32':
    return 'std::int32_t*'
  elif result_type == 'float':
    return 'float*'
  else:
    raise ConfigurationError('Unsupported result type: %s' % result_type)
3697b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3707b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def GetParameters(result_type):
  """Build the C++ parameter list of a generated mul function.

  Args:
    result_type: either 'int32' or 'float'; selects the type of the 'result'
      pointer via CppResultType.

  Returns:
    A list of [cpp_type, name] pairs: lhs, rhs, count, result and
    result_stride, plus a trailing ['float', 'result_scale'] entry when
    result_type equals 'float'.

  Raises:
    ConfigurationError: propagated from CppResultType for an unsupported
      result_type.
  """
  params = [['const std::uint8_t*', 'lhs'], ['const std::uint8_t*', 'rhs'],
            ['std::int32_t', 'count'], [CppResultType(result_type), 'result'],
            ['std::int32_t', 'result_stride']]
  # Compare by value, not identity: with `is`, an equal but non-interned
  # 'float' string would silently drop the result_scale parameter.
  if result_type == 'float':
    params.append(['float', 'result_scale'])
  return params
3787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def GenerateMulNx8Mx8(emitter, result_type, lhs_add, rhs_add, left_lanes_count,
                      right_lanes_count):
  """Emit one inline mul function for the given left/right lane counts.

  The emitted function is named by BuildName and takes the parameters listed
  by GetParameters. Its body asserts on 'count', then contains a single asm
  block that loads, multiplies and aggregates the lanes and finally reduces
  and stores the aggregators.

  Args:
    emitter: emitter object receiving all generated output.
    result_type: result type tag, forwarded to BuildName, GetParameters and
      GenerateAggregatorReduceStore.
    lhs_add: forwarded to BuildName and GenerateAggregatorReduceStore.
    rhs_add: forwarded to BuildName and GenerateAggregatorReduceStore.
    left_lanes_count: number of left lanes, 1 to 4.
    right_lanes_count: number of right lanes, 1 to 4.

  Raises:
    ConfigurationError: if either lanes count is outside 1..4.
  """
  for lanes_count, message in (
      (left_lanes_count, 'Left_lanes should be: 1, 2, 3 or 4.'),
      (right_lanes_count, 'Right_lanes should be: 1, 2, 3 or 4.')):
    if lanes_count < 1 or lanes_count > 4:
      raise ConfigurationError(message)

  function_name = BuildName(result_type, lhs_add, rhs_add, left_lanes_count,
                            right_lanes_count)
  function_params = GetParameters(result_type)
  emitter.EmitFunctionBeginA(function_name, function_params, 'inline void')

  for condition in ('count % 8 == 0', 'count >= 8'):
    emitter.EmitAssert(condition)
  emitter.EmitAsmBegin()

  registers = neon_emitter.NeonRegisters()

  # Map count, lhs and rhs, in that order.
  count = registers.MapParameter('count')
  lhs = registers.MapParameter('lhs')
  rhs = registers.MapParameter('rhs')

  emitter.EmitPld(lhs)
  emitter.EmitPld(rhs)

  # One aggregator per cell of the left x right result block.
  block_size = left_lanes_count * right_lanes_count
  aggregators = GenerateAndClearAggregators(emitter, registers, block_size)

  if block_size >= 9:
    # Written for left == 3 and right == 3: a quad register is reserved as a
    # backup and handed to both the left-lane setup and the 3x3 kernel.
    # NOTE(review): 3x4, 4x3 and 4x4 pass the validation above and also land
    # here -- confirm whether those shapes are meant to be supported.
    backup_register = registers.QuadRegister()
    left_lanes = Generate3MulLanes(backup_register, registers, lhs)
    right_lanes = GenerateMulLanes(registers, right_lanes_count, rhs)

    Generate3x3LoadMultiplyAggregate(emitter, registers, left_lanes,
                                     right_lanes, aggregators, count,
                                     backup_register)
  else:
    left_lanes = GenerateMulLanes(registers, left_lanes_count, lhs)
    right_lanes = GenerateMulLanes(registers, right_lanes_count, rhs)

    GenerateNxMLoadMultiplyAggregate(emitter, registers, left_lanes,
                                     right_lanes, aggregators, count)

  # The lane registers are released before the reduce/store step; the lane
  # objects themselves are still passed on to it.
  left_lanes.FreeRegisters(registers)
  right_lanes.FreeRegisters(registers)

  results = registers.MapParameter('result')
  results_stride = registers.MapParameter('result_stride')
  GenerateAggregatorReduceStore(emitter, registers, aggregators, result_type,
                                lhs_add, rhs_add, left_lanes, right_lanes,
                                results, results_stride)

  mapped_parameters = registers.MappedParameters()
  clobbers = registers.Clobbers() + ['cc', 'memory']
  emitter.EmitAsmEnd(mapped_parameters, [], clobbers)
  emitter.EmitFunctionEnd()
4367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
def GenerateFunctions(emitter, result_type, lhs_add, rhs_add):
  """Emit all mul function variants for one result type / add configuration.

  Generates every left x right combination from 1x1 to 3x3 (left count
  varying slowest), followed by the 1x4 variant. A newline is emitted after
  each generated function.

  Args:
    emitter: emitter object receiving the generated output.
    result_type: forwarded to GenerateMulNx8Mx8.
    lhs_add: forwarded to GenerateMulNx8Mx8.
    rhs_add: forwarded to GenerateMulNx8Mx8.
  """
  lane_shapes = [(left, right)
                 for left in range(1, 4)
                 for right in range(1, 4)]
  lane_shapes.append((1, 4))

  for left_lanes, right_lanes in lane_shapes:
    GenerateMulNx8Mx8(emitter, result_type, lhs_add, rhs_add, left_lanes,
                      right_lanes)
    emitter.EmitNewline()
447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
if __name__ == '__main__':
  # Run as a script: generate the 'int32' result functions with lhs_add and
  # rhs_add both enabled, using a fresh NeonEmitter.
  GenerateFunctions(emitter=neon_emitter.NeonEmitter(), result_type='int32',
                    lhs_add=True, rhs_add=True)
451