#include <arm_neon.h>

// Aggregate of four NEON float32x4_t vectors, one per matrix row.
// NOTE(review): the "43" in the name presumably means 4 rows x 3 meaningful
// columns (an affine transform); each row still stores four float lanes.
struct Matrix43 {
    float32x4_t row0;
    float32x4_t row1;
    float32x4_t row2;
    float32x4_t row3;
};

// Multiplies two Matrix43 values and returns the product by value.
//
// For every result row N:
//     rr.rowN = m2.row0 * m1.rowN[0] + m2.row1 * m1.rowN[1] + m2.row2 * m1.rowN[2]
// and row3 additionally starts its accumulation from m2.row3 (so m2.row3 is
// added once to the last row). Only lanes 0..2 of each m1 row are read;
// lane 3 of m1's rows never contributes to the result.
//
// Intrinsics used: vmulq_n_f32(v, s) scales a vector by a scalar,
// vmlaq_n_f32(acc, v, s) computes acc + v * s, and vgetq_lane_f32(v, i)
// extracts lane i as a scalar.
//
// The function is forced inline via __attribute__((always_inline)).
__attribute__((always_inline)) inline Matrix43 operator*(const Matrix43& m1, const Matrix43& m2) {
    Matrix43 rr;
    // Row 0: linear combination of m2's first three rows.
    rr.row0 = vmulq_n_f32(         m2.row0, vgetq_lane_f32(m1.row0, 0));
    rr.row0 = vmlaq_n_f32(rr.row0, m2.row1, vgetq_lane_f32(m1.row0, 1));
    rr.row0 = vmlaq_n_f32(rr.row0, m2.row2, vgetq_lane_f32(m1.row0, 2));

    // Row 1: same pattern using the lanes of m1.row1.
    rr.row1 = vmulq_n_f32(         m2.row0, vgetq_lane_f32(m1.row1, 0));
    rr.row1 = vmlaq_n_f32(rr.row1, m2.row1, vgetq_lane_f32(m1.row1, 1));
    rr.row1 = vmlaq_n_f32(rr.row1, m2.row2, vgetq_lane_f32(m1.row1, 2));

    // Row 2: same pattern using the lanes of m1.row2.
    rr.row2 = vmulq_n_f32(         m2.row0, vgetq_lane_f32(m1.row2, 0));
    rr.row2 = vmlaq_n_f32(rr.row2, m2.row1, vgetq_lane_f32(m1.row2, 1));
    rr.row2 = vmlaq_n_f32(rr.row2, m2.row2, vgetq_lane_f32(m1.row2, 2));

    // Row 3: the accumulator is seeded with m2.row3 instead of starting
    // from a plain multiply, so m2.row3 is carried into the result.
    rr.row3 = vmlaq_n_f32(m2.row3, m2.row0, vgetq_lane_f32(m1.row3, 0));
    rr.row3 = vmlaq_n_f32(rr.row3, m2.row1, vgetq_lane_f32(m1.row3, 1));
    rr.row3 = vmlaq_n_f32(rr.row3, m2.row2, vgetq_lane_f32(m1.row3, 2));
    return rr;
}

// Evaluates m * m * m (two chained, force-inlined Matrix43 multiplications)
// and discards the result. The parameters a1 and a2 are not used.
//
// NOTE(review): judging by the name, this is presumably a minimal reproducer
// for an internal compiler error, so the discarded expression and the unused
// parameters look deliberate -- confirm before "cleaning up" this function.
void _f_with_internal_compiler_error(const Matrix43& m, const void* a1, const void* a2) {
    m * m * m;
}