Lines Matching refs:p0

67                                  uint8x16_t* const p1, uint8x16_t* const p0,
69 // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
70 // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
74 *p0 = vcombine_u8(row0.val[1], row8.val[1]);
87 uint8x16_t* const p1, uint8x16_t* const p0,
120 *p0 = vreinterpretq_u8_u16(row13.val[0]);
131 uint8x16_t* const p1, uint8x16_t* const p0,
134 Load4x16(src - 2, stride, p3, p2, p1, p0);
139 uint8x16_t* const p1, uint8x16_t* const p0,
142 *p0 = vld1q_u8(src - 1 * stride);
149 uint8x16_t* const p1, uint8x16_t* const p0,
152 Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
160 uint8x16_t* const p1, uint8x16_t* const p0,
168 *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
184 uint8x16_t* const p1, uint8x16_t* const p0,
231 *p0 = vreinterpretq_u8_u32(row37.val[0]);
253 static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
256 lo.val[0] = vget_low_u8(p0);
258 hi.val[0] = vget_high_u8(p0);
277 static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
282 vget_low_u8(p1), vget_low_u8(p0),
285 vget_high_u8(p1), vget_high_u8(p0),
292 static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
294 vst1q_u8(dst - stride, p0);
298 static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
301 Store16x2(p1, p0, dst - stride, stride);
305 static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
308 // p0 and q0 contain the u+v samples packed in low/high halves.
309 vst1_u8(u - stride, vget_low_u8(p0));
311 vst1_u8(v - stride, vget_high_u8(p0));
315 static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
320 Store8x2x2(p1, p0, u - stride, v - stride, stride);
333 const uint8x16_t p0, const uint8x16_t q0,
338 INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
340 INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
361 static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
367 vget_low_u8(p1), vget_low_u8(p0),
370 vget_high_u8(p1), vget_high_u8(p0),
440 static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
444 const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
446 const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
463 static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
465 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
467 const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
468 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
469 const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
499 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
504 const int8x16_t p0s = FlipSign(p0);
513 uint8x16_t p1, p0, q0, q1, op0, oq0;
514 Load16x4(p, stride, &p1, &p0, &q0, &q1);
516 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
517 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
523 uint8x16_t p1, p0, q0, q1, oq0, op0;
524 Load4x16(p, stride, &p1, &p0, &q0, &q1);
526 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
527 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
545 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
546 "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
548 "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
550 "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
554 #define GET_BASE_DELTA(p1, p0, q0, q1, o) \
555 "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
557 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (q0 - p0) */ \
558 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (q0 - p0) */ \
559 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (q0 - p0) */
561 #define DO_SIMPLE_FILTER(p0, q0, fl) \
565 "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
572 // Applies filter on 2 pixels (p0 and q0)
573 #define DO_FILTER2(p1, p0, q0, q1, thresh) \
574 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
576 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
577 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
579 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
580 FLIP_SIGN_BIT2(p0, q0, q10)
587 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
611 "vswp d3, d24 \n" // p1:q1 p0:q3
613 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
650 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
654 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
663 const uint8x16_t p1, const uint8x16_t p0,
670 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
680 const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
688 const int8x16_t p1, const int8x16_t p0,
700 *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2)
707 const uint8x16_t p1, const uint8x16_t p0,
714 int8x16_t p0s = FlipSign(p0);
745 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
772 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1)
781 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
789 int8x16_t p0s = FlipSign(p0);
822 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
823 Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
825 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
827 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
829 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
839 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840 Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
842 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
844 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
846 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
858 uint8x16_t p3, p2, p1, p0;
859 Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
866 NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
867 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
869 // re-used for next span. And q2/q3 will become p1/p0 accordingly.
870 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
871 Store16x4(p1, p0, p3, p2, p, stride);
873 p0 = q3;
882 uint8x16_t p3, p2, p1, p0;
883 Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
890 NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
891 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
892 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
893 Store4x16(p1, p0, p3, p2, p, stride);
895 p0 = q3;
904 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
905 Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
907 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
909 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
911 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
920 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
923 Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
925 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
927 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
929 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
937 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
938 Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
940 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
942 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
944 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
952 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
955 Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
957 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
959 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
961 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);