Lines Matching refs:q0

68                                  uint8x16_t* const q0, uint8x16_t* const q1) {
69 // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
70 // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
75 *q0 = vcombine_u8(row0.val[2], row8.val[2]);
88 uint8x16_t* const q0, uint8x16_t* const q1) {
121 *q0 = vreinterpretq_u8_u16(row02.val[1]);
132 uint8x16_t* const q0, uint8x16_t* const q1,
135 Load4x16(src + 2, stride, q0, q1, q2, q3);
140 uint8x16_t* const q0, uint8x16_t* const q1) {
143 *q0 = vld1q_u8(src + 0 * stride);
150 uint8x16_t* const q0, uint8x16_t* const q1,
153 Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
161 uint8x16_t* const q0, uint8x16_t* const q1,
169 *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
185 uint8x16_t* const q0, uint8x16_t* const q1,
232 *q0 = vreinterpretq_u8_u32(row04.val[1]);
253 static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
257 lo.val[1] = vget_low_u8(q0);
259 hi.val[1] = vget_high_u8(q0);
278 const uint8x16_t q0, const uint8x16_t q1,
283 vget_low_u8(q0), vget_low_u8(q1));
286 vget_high_u8(q0), vget_high_u8(q1));
292 static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
295 vst1q_u8(dst, q0);
299 const uint8x16_t q0, const uint8x16_t q1,
302 Store16x2(q0, q1, dst + stride, stride);
305 static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
308 // p0 and q0 contain the u+v samples packed in low/high halves.
310 vst1_u8(u, vget_low_u8(q0));
312 vst1_u8(v, vget_high_u8(q0));
316 const uint8x16_t q0, const uint8x16_t q1,
321 Store8x2x2(q0, q1, u + stride, v + stride, stride);
333 const uint8x16_t p0, const uint8x16_t q0,
339 INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
341 INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
362 const uint8x16_t q0, const uint8x16_t q1,
368 vget_low_u8(q0), vget_low_u8(q1));
371 vget_high_u8(q0), vget_high_u8(q1));
441 const uint8x16_t q0, const uint8x16_t q1,
444 const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
446 const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
464 const int8x16_t q0, const int8x16_t q1) {
465 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
467 const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
468 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
469 const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
500 const uint8x16_t q0, const uint8x16_t q1,
505 const int8x16_t q0s = FlipSign(q0);
513 uint8x16_t p1, p0, q0, q1, op0, oq0;
514 Load16x4(p, stride, &p1, &p0, &q0, &q1);
516 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
517 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
523 uint8x16_t p1, p0, q0, q1, oq0, op0;
524 Load4x16(p, stride, &p1, &p0, &q0, &q1);
526 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
527 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
534 #define QRegs "q0", "q1", "q2", "q3", \
545 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
546 "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
548 "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
550 "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
554 #define GET_BASE_DELTA(p1, p0, q0, q1, o) \
555 "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
557 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (q0 - p0) */ \
558 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (q0 - p0) */ \
559 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (q0 - p0) */
561 #define DO_SIMPLE_FILTER(p0, q0, fl) \
570 "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */
572 // Applies filter on 2 pixels (p0 and q0)
573 #define DO_FILTER2(p1, p0, q0, q1, thresh) \
574 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
576 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
577 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
579 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
580 FLIP_SIGN_BIT2(p0, q0, q10)
588 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
612 "vswp d5, d26 \n" // q0:q2 q1:q4
613 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
651 const uint8x16_t q0, const uint8x16_t q1,
655 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
664 const uint8x16_t q0, const uint8x16_t q1,
673 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
680 const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
689 const int8x16_t q0, const int8x16_t q1,
701 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
708 const uint8x16_t q0, const uint8x16_t q1,
715 int8x16_t q0s = FlipSign(q0);
746 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
773 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
782 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
790 int8x16_t q0s = FlipSign(q0);
822 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
823 Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
825 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
827 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
829 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
839 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840 Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
842 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
844 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
846 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
861 uint8x16_t q0, q1, q2, q3;
863 Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
866 NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
867 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
870 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
885 uint8x16_t q0, q1, q2, q3;
887 Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
890 NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
891 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
892 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
904 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
905 Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
907 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
909 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
911 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
920 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
923 Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
925 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
927 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
929 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
937 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
938 Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
940 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
942 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
944 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
952 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
955 Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
957 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
959 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
961 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
1162 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */