/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <x86intrin.h>

namespace android {
namespace renderscript {

/* Unsigned extend the low packed 8-bit integers into packed 32-bit integers
 * (pmovzxbd on SSE4.1; emulated with pshufb on SSSE3). */
24static inline __m128i cvtepu8_epi32(__m128i x) {
25#if defined(__SSE4_1__)
26 return _mm_cvtepu8_epi32(x);
27#elif defined(__SSSE3__)
28 const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
29 x = _mm_shuffle_epi8(x, M8to32);
30 return x;
31#else
32# error "Require at least SSSE3"
33#endif
34}
35
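/* Pack signed 32-bit integers into unsigned 16-bit integers with saturation
 * (packusdw on SSE4.1; emulated on SSSE3 by clamping each lane to
 * [0, 0xffff] with compare masks and gathering the low halves with pshufb). */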
36static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
37#if defined(__SSE4_1__)
38 return _mm_packus_epi32(lo, hi);
39#elif defined(__SSSE3__)
40 const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
41 const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
42 const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
43 const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
44 lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
45 lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
46 hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
47 hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
48 return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
49 _mm_shuffle_epi8(hi, M32to16H));
50#else
51# error "Require at least SSSE3"
52#endif
53}
54
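/* Multiply packed 32-bit integers and keep the low 32 bits of each product
 * (pmulld on SSE4.1; emulated on SSSE3 with two pmuludq multiplies on the
 * even and odd lanes). */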
55static inline __m128i mullo_epi32(__m128i x, __m128i y) {
56#if defined(__SSE4_1__)
57 return _mm_mullo_epi32(x, y);
58#elif defined(__SSSE3__)
59 const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
60 __m128i even = _mm_mul_epu32(x, y);
61 __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
62 _mm_srli_si128(y, 4));
63 even = _mm_and_si128(even, Meven);
64 odd = _mm_and_si128(odd, Meven);
65 return _mm_or_si128(even, _mm_slli_si128(odd, 4));
66#else
67# error "Require at least SSSE3"
68#endif
69}
70
/* 'mask' must contain packed 8-bit values that are either 0x00 or 0xff */
72static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
73#if defined(__SSE4_1__)
74 return _mm_blendv_epi8(x, y, mask);
75#elif defined(__SSSE3__)
76 return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
77#else
78# error "Require at least SSSE3"
79#endif
80}
81
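/* 3x3 convolution over 8-bit RGBA pixels.  y0, y1 and y2 point to three
 * adjacent input rows and coef holds nine 16-bit coefficients; the
 * coefficients are effectively fixed point, since each accumulated sum is
 * shifted right by 8 and saturated back to 8 bits.  Each loop iteration
 * produces two output pixels (8 bytes). */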
82extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
83 const void *y1, const void *y2,
84 const short *coef, uint32_t count) {
85 __m128i x;
86 __m128i c0, c2, c4, c6, c8;
88 __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
89 __m128i o0, o1;
90 uint32_t i;
91
92 x = _mm_loadl_epi64((const __m128i *)(coef+0));
93 c0 = _mm_shuffle_epi32(x, 0x00);
94 c2 = _mm_shuffle_epi32(x, 0x55);
95 x = _mm_loadl_epi64((const __m128i *)(coef+4));
96 c4 = _mm_shuffle_epi32(x, 0x00);
97 c6 = _mm_shuffle_epi32(x, 0x55);
98 x = _mm_loadl_epi64((const __m128i *)(coef+8));
99 c8 = _mm_shuffle_epi32(x, 0x00);
100
101 for (i = 0; i < count; ++i) {
102
103 p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
104 p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
105 p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
106 p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
107 p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
108 p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
109 p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
110 p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
111 p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
112 p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
113 p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
114 p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
115
116 o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
117 o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
118
119 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
120 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
121
122 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
123 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
124
125 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
126 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
127
128 o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
129 o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
130
131 o0 = _mm_srai_epi32(o0, 8);
132 o1 = _mm_srai_epi32(o1, 8);
133
134 o0 = packus_epi32(o0, o1);
135 o0 = _mm_packus_epi16(o0, o0);
136 _mm_storel_epi64((__m128i *)dst, o0);
137
138 y0 = (const char *)y0 + 8;
139 y1 = (const char *)y1 + 8;
140 y2 = (const char *)y2 + 8;
141 dst = (char *)dst + 8;
142 }
143}
144
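/* Multiply each RGBA pixel by a 4x4 matrix of 16-bit coefficients.  The
 * input is deinterleaved with pshufb, each output channel is accumulated
 * with pmaddwd, shifted right by 8 and saturated to 8 bits, and T4x4
 * re-interleaves the result.  Four pixels are processed per iteration. */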
145void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
146 const short *coef, uint32_t count) {
147 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
148 14, 10, 6, 2,
149 13, 9, 5, 1,
150 12, 8, 4, 0);
151
152 const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
153 const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
154 __m128i c0, c1, c2, c3;
155 __m128i i4, o4;
156 __m128i xy, zw;
157 __m128i x2, y2, z2, w2;
158 uint32_t i;
159
160 c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
161 c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
162 c0 = _mm_unpacklo_epi16(c0, c1);
163
164 c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
165 c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
166 c2 = _mm_unpacklo_epi16(c2, c3);
167
168 for (i = 0; i < count; ++i) {
169 i4 = _mm_load_si128((const __m128i *)src);
170 xy = _mm_shuffle_epi8(i4, Mxy);
171 zw = _mm_shuffle_epi8(i4, Mzw);
172
173 x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
174 y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
175 z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
176 w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
177
178 x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
179 y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
180 z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
181 w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
182
183 x2 = _mm_srai_epi32(x2, 8);
184 y2 = _mm_srai_epi32(y2, 8);
185 z2 = _mm_srai_epi32(z2, 8);
186 w2 = _mm_srai_epi32(w2, 8);
187
188 x2 = packus_epi32(x2, y2);
189 z2 = packus_epi32(z2, w2);
190 o4 = _mm_packus_epi16(x2, z2);
191
192 o4 = _mm_shuffle_epi8(o4, T4x4);
193 _mm_storeu_si128((__m128i *)dst, o4);
194
195 src = (const char *)src + 16;
196 dst = (char *)dst + 16;
197 }
198}
199
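/* Same as the 4x4 variant, but only R, G and B are run through the matrix;
 * the original alpha is carried through unchanged (w2 keeps the high 16 bits
 * of the zw pairs). */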
200void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
201 const short *coef, uint32_t count) {
202 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
203 14, 10, 6, 2,
204 13, 9, 5, 1,
205 12, 8, 4, 0);
206
207 const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
208 const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
209
210 __m128i c0, c1, c2, c3;
211 __m128i i4, o4;
212 __m128i xy, zw;
213 __m128i x2, y2, z2, w2;
214 uint32_t i;
215
216 c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
217 c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
218 c0 = _mm_unpacklo_epi16(c0, c1);
219
220 c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
221 c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
222 c2 = _mm_unpacklo_epi16(c2, c3);
223
224 for (i = 0; i < count; ++i) {
225 i4 = _mm_loadu_si128((const __m128i *)src);
226 xy = _mm_shuffle_epi8(i4, Mxy);
227 zw = _mm_shuffle_epi8(i4, Mzw);
228
229 x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
230 y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
231 z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
232
233 x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
234 y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
235 z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
236
237 x2 = _mm_srai_epi32(x2, 8);
238 y2 = _mm_srai_epi32(y2, 8);
239 z2 = _mm_srai_epi32(z2, 8);
240 w2 = _mm_srli_epi32(zw, 16);
241
242 x2 = packus_epi32(x2, y2);
243 z2 = packus_epi32(z2, w2);
244 o4 = _mm_packus_epi16(x2, z2);
245
246 o4 = _mm_shuffle_epi8(o4, T4x4);
247 _mm_storeu_si128((__m128i *)dst, o4);
248
249 src = (const char *)src + 16;
250 dst = (char *)dst + 16;
251 }
252}
253
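/* Dot-product variant of the color matrix kernel: the first coefficient of
 * each group of four (coef[0], coef[4], coef[8], coef[12]) is broadcast, a
 * single weighted sum of the four input channels is computed per pixel and
 * written to R, G and B, while the original alpha is preserved. */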
254void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
255 const short *coef, uint32_t count) {
256 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
257 14, 10, 6, 2,
258 13, 9, 5, 1,
259 12, 8, 4, 0);
260 const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
261 const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
262 __m128i c0, c1, c2, c3;
263 __m128i i4, o4;
264 __m128i xy, zw;
265 __m128i x2, y2, z2, w2;
266 uint32_t i;
267
268 c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
269 c0 = _mm_shufflelo_epi16(c0, 0);
270 c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
271 c1 = _mm_shufflelo_epi16(c1, 0);
272 c0 = _mm_unpacklo_epi16(c0, c1);
273
274 c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
275 c2 = _mm_shufflelo_epi16(c2, 0);
276 c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
277 c3 = _mm_shufflelo_epi16(c3, 0);
278 c2 = _mm_unpacklo_epi16(c2, c3);
279
280 for (i = 0; i < count; ++i) {
281 i4 = _mm_loadu_si128((const __m128i *)src);
282
283 xy = _mm_shuffle_epi8(i4, Mxy);
284 zw = _mm_shuffle_epi8(i4, Mzw);
285
286 x2 = _mm_madd_epi16(xy, c0);
287 x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
288
289 x2 = _mm_srai_epi32(x2, 8);
290 y2 = x2;
291 z2 = x2;
292 w2 = _mm_srli_epi32(zw, 16);
293
294 x2 = packus_epi32(x2, y2);
295 z2 = packus_epi32(z2, w2);
296 o4 = _mm_packus_epi16(x2, z2);
297
298 o4 = _mm_shuffle_epi8(o4, T4x4);
299 _mm_storeu_si128((__m128i *)dst, o4);
300
301 src = (const char *)src + 16;
302 dst = (char *)dst + 16;
303 }
304}
305
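/* Vertical blur pass for 4-channel u8 data: for each output column it
 * accumulates rct source rows, weighted by the float kernel at gptr, into
 * float RGBA sums.  Two pixels (eight floats) are written per iteration;
 * x1/x2 give the column range and stride is the row pitch in bytes. */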
306void rsdIntrinsicBlurVFU4_K(void *dst,
307 const void *pin, int stride, const void *gptr,
308 int rct, int x1, int x2) {
309 const char *pi;
310 __m128i pi0, pi1;
311 __m128 pf0, pf1;
312 __m128 bp0, bp1;
313 __m128 x;
314 int r;
315
316 for (; x1 < x2; x1 += 2) {
317 pi = (const char *)pin + (x1 << 2);
318 bp0 = _mm_setzero_ps();
319 bp1 = _mm_setzero_ps();
320
321 for (r = 0; r < rct; ++r) {
322 x = _mm_load_ss((const float *)gptr + r);
323 x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
324
325 pi0 = _mm_cvtsi32_si128(*(const int *)pi);
326 pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
327
328 pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
329 pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
330
331 bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
332 bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
333
334 pi += stride;
335 }
336
337 _mm_storeu_ps((float *)dst, bp0);
338 _mm_storeu_ps((float *)dst + 4, bp1);
339 dst = (char *)dst + 32;
340 }
341}
342
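/* Horizontal blur pass over the 4-channel float data produced by the
 * vertical pass.  Each iteration forms one output pixel: the first tap is
 * applied, the remaining rct-1 taps are processed two at a time, and the
 * float result is converted back to packed u8 via the Mu8 shuffle. */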
343void rsdIntrinsicBlurHFU4_K(void *dst,
344 const void *pin, const void *gptr,
345 int rct, int x1, int x2) {
346 const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
347 const float *pi;
348 __m128 pf, x, y;
349 __m128i o;
350 int r;
351
352 for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
354 x = _mm_load_ss((const float *)gptr);
355 x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
356
357 pi = (const float *)pin + (x1 << 2);
358 pf = _mm_mul_ps(x, _mm_load_ps(pi));
359
360 for (r = 1; r < rct; r += 2) {
361 x = _mm_load_ss((const float *)gptr + r);
362 y = _mm_load_ss((const float *)gptr + r + 1);
363 x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
364 y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
365
366 pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
367 pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
368 }
369
370 o = _mm_cvtps_epi32(pf);
371 *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
372 dst = (char *)dst + 4;
373 }
374}
375
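/* Horizontal blur pass for single-channel float data.  Four adjacent outputs
 * are computed per iteration; inside the tap loop, palignr shifts the sliding
 * window of source samples one element at a time.  The tap loop appears to
 * assume that rct-1 is a multiple of four (or that the kernel is padded). */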
376void rsdIntrinsicBlurHFU1_K(void *dst,
377 const void *pin, const void *gptr,
378 int rct, int x1, int x2) {
379 const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
380 const float *pi;
381 __m128 pf, g0, g1, g2, g3, gx, p0, p1;
382 __m128i o;
383 int r;
384
385 for (; x1 < x2; x1+=4) {
386 g0 = _mm_load_ss((const float *)gptr);
387 g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
388
389 pi = (const float *)pin + x1;
390 pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
391
392 for (r = 1; r < rct; r += 4) {
393 gx = _mm_loadu_ps((const float *)gptr + r);
394 p0 = _mm_loadu_ps(pi + r);
395 p1 = _mm_loadu_ps(pi + r + 4);
396
397 g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
398 pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            /* _mm_alignr_epi8 works on integer vectors; cast explicitly so the
             * sliding-window shifts compile without lax vector conversions. */
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
405 }
406
407 o = _mm_cvtps_epi32(pf);
408 *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
409 dst = (char *)dst + 4;
410 }
411}
412
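/* YUV to RGBA conversion for a semi-planar source: pY is the luma plane and
 * pUV the interleaved chroma plane, with V in the even bytes and U in the odd
 * bytes (NV21-style ordering).  param supplies the coefficients and biases
 * noted in the comments below (presumably BT.601 video-range values); the
 * 128 bias is also reused as a rounding term before the >>8.  Each iteration
 * converts four pixels, and the loop runs 2*count times. */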
413void rsdIntrinsicYuv_K(void *dst,
414 const unsigned char *pY, const unsigned char *pUV,
415 uint32_t count, const short *param) {
416 __m128i biasY, biasUV;
417 __m128i c0, c1, c2, c3, c4;
418
419 biasY = _mm_set1_epi32(param[8]); /* 16 */
420 biasUV = _mm_set1_epi32(param[16]); /* 128 */
421
422 c0 = _mm_set1_epi32(param[0]); /* 298 */
423 c1 = _mm_set1_epi32(param[1]); /* 409 */
424 c2 = _mm_set1_epi32(param[2]); /* -100 */
425 c3 = _mm_set1_epi32(param[3]); /* 516 */
426 c4 = _mm_set1_epi32(param[4]); /* -208 */
427
428 __m128i Y, UV, U, V, R, G, B, A;
429
430 A = _mm_set1_epi32(255);
431 uint32_t i;
432
433 for (i = 0; i < (count << 1); ++i) {
434 Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
435 UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
436
437 Y = _mm_sub_epi32(Y, biasY);
438 UV = _mm_sub_epi32(UV, biasUV);
439
440 U = _mm_shuffle_epi32(UV, 0xf5);
441 V = _mm_shuffle_epi32(UV, 0xa0);
442
443 Y = mullo_epi32(Y, c0);
444
445 R = _mm_add_epi32(Y, mullo_epi32(V, c1));
446 R = _mm_add_epi32(R, biasUV);
447 R = _mm_srai_epi32(R, 8);
448
449 G = _mm_add_epi32(Y, mullo_epi32(U, c2));
450 G = _mm_add_epi32(G, mullo_epi32(V, c4));
451 G = _mm_add_epi32(G, biasUV);
452 G = _mm_srai_epi32(G, 8);
453
454 B = _mm_add_epi32(Y, mullo_epi32(U, c3));
455 B = _mm_add_epi32(B, biasUV);
456 B = _mm_srai_epi32(B, 8);
457
458 __m128i y1, y2, y3, y4;
459
460 y1 = packus_epi32(R, G);
461 y2 = packus_epi32(B, A);
462 y3 = _mm_packus_epi16(y1, y2);
463 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
464 14, 10, 6, 2,
465 13, 9, 5, 1,
466 12, 8, 4, 0);
467 y4 = _mm_shuffle_epi8(y3, T4x4);
468 _mm_storeu_si128((__m128i *)dst, y4);
469 pY += 4;
470 pUV += 4;
471 dst = (__m128i *)dst + 1;
472 }
473}
474
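/* Same conversion as rsdIntrinsicYuv_K, but with the chroma byte order
 * swapped: U in the even bytes and V in the odd bytes (NV12-style ordering). */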
475void rsdIntrinsicYuvR_K(void *dst,
476 const unsigned char *pY, const unsigned char *pUV,
477 uint32_t count, const short *param) {
478 __m128i biasY, biasUV;
479 __m128i c0, c1, c2, c3, c4;
480
481 biasY = _mm_set1_epi32(param[8]); /* 16 */
482 biasUV = _mm_set1_epi32(param[16]); /* 128 */
483
484 c0 = _mm_set1_epi32(param[0]); /* 298 */
485 c1 = _mm_set1_epi32(param[1]); /* 409 */
486 c2 = _mm_set1_epi32(param[2]); /* -100 */
487 c3 = _mm_set1_epi32(param[3]); /* 516 */
488 c4 = _mm_set1_epi32(param[4]); /* -208 */
489
490 __m128i Y, UV, U, V, R, G, B, A;
491
492 A = _mm_set1_epi32(255);
493 uint32_t i;
494
495 for (i = 0; i < (count << 1); ++i) {
496 Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
497 UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
498
499 Y = _mm_sub_epi32(Y, biasY);
500 UV = _mm_sub_epi32(UV, biasUV);
501
502 V = _mm_shuffle_epi32(UV, 0xf5);
503 U = _mm_shuffle_epi32(UV, 0xa0);
504
505 Y = mullo_epi32(Y, c0);
506
507 R = _mm_add_epi32(Y, mullo_epi32(V, c1));
508 R = _mm_add_epi32(R, biasUV);
509 R = _mm_srai_epi32(R, 8);
510
511 G = _mm_add_epi32(Y, mullo_epi32(U, c2));
512 G = _mm_add_epi32(G, mullo_epi32(V, c4));
513 G = _mm_add_epi32(G, biasUV);
514 G = _mm_srai_epi32(G, 8);
515
516 B = _mm_add_epi32(Y, mullo_epi32(U, c3));
517 B = _mm_add_epi32(B, biasUV);
518 B = _mm_srai_epi32(B, 8);
519
520 __m128i y1, y2, y3, y4;
521
522 y1 = packus_epi32(R, G);
523 y2 = packus_epi32(B, A);
524 y3 = _mm_packus_epi16(y1, y2);
525 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
526 14, 10, 6, 2,
527 13, 9, 5, 1,
528 12, 8, 4, 0);
529 y4 = _mm_shuffle_epi8(y3, T4x4);
530 _mm_storeu_si128((__m128i *)dst, y4);
531 pY += 4;
532 pUV += 4;
533 dst = (__m128i *)dst + 1;
534 }
535}
536
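/* Fully planar variant of the YUV to RGBA conversion: Y, U and V are read
 * from three separate planes; the arithmetic is otherwise identical to
 * rsdIntrinsicYuv_K. */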
537void rsdIntrinsicYuv2_K(void *dst,
538 const unsigned char *pY, const unsigned char *pU,
539 const unsigned char *pV, uint32_t count, const short *param) {
540 __m128i biasY, biasUV;
541 __m128i c0, c1, c2, c3, c4;
542
543 biasY = _mm_set1_epi32(param[8]); /* 16 */
544 biasUV = _mm_set1_epi32(param[16]); /* 128 */
545
546 c0 = _mm_set1_epi32(param[0]); /* 298 */
547 c1 = _mm_set1_epi32(param[1]); /* 409 */
548 c2 = _mm_set1_epi32(param[2]); /* -100 */
549 c3 = _mm_set1_epi32(param[3]); /* 516 */
550 c4 = _mm_set1_epi32(param[4]); /* -208 */
551
552 __m128i Y, U, V, R, G, B, A;
553
554 A = _mm_set1_epi32(255);
555 uint32_t i;
556
557 for (i = 0; i < (count << 1); ++i) {
558 Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
559 U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
560 V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
561
562 Y = _mm_sub_epi32(Y, biasY);
563 U = _mm_sub_epi32(U, biasUV);
564 V = _mm_sub_epi32(V, biasUV);
565
566 Y = mullo_epi32(Y, c0);
567
568 R = _mm_add_epi32(Y, mullo_epi32(V, c1));
569 R = _mm_add_epi32(R, biasUV);
570 R = _mm_srai_epi32(R, 8);
571
572 G = _mm_add_epi32(Y, mullo_epi32(U, c2));
573 G = _mm_add_epi32(G, mullo_epi32(V, c4));
574 G = _mm_add_epi32(G, biasUV);
575 G = _mm_srai_epi32(G, 8);
576
577 B = _mm_add_epi32(Y, mullo_epi32(U, c3));
578 B = _mm_add_epi32(B, biasUV);
579 B = _mm_srai_epi32(B, 8);
580
581 __m128i y1, y2, y3, y4;
582
583 y1 = packus_epi32(R, G);
584 y2 = packus_epi32(B, A);
585 y3 = _mm_packus_epi16(y1, y2);
586 const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
587 14, 10, 6, 2,
588 13, 9, 5, 1,
589 12, 8, 4, 0);
590 y4 = _mm_shuffle_epi8(y3, T4x4);
591 _mm_storeu_si128((__m128i *)dst, y4);
592 pY += 4;
593 pU += 4;
594 pV += 4;
595 dst = (__m128i *)dst + 1;
596 }
597}
598
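/* 5x5 convolution over 8-bit RGBA pixels.  y0..y4 point to five adjacent
 * input rows and coef holds 25 16-bit coefficients; as in the 3x3 kernel,
 * each sum is shifted right by 8 and saturated back to u8.  Each iteration
 * produces four output pixels (16 bytes). */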
599extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
600 const void *y1, const void *y2,
601 const void *y3, const void *y4,
602 const short *coef, uint32_t count) {
603 __m128i x;
604 __m128i c0, c2, c4, c6, c8, c10, c12;
605 __m128i c14, c16, c18, c20, c22, c24;
607 __m128i p0, p1, p2, p3, p4, p5, p6, p7;
608 __m128i p8, p9, p10, p11, p12, p13, p14, p15;
609 __m128i p16, p17, p18, p19, p20, p21, p22, p23;
610 __m128i p24, p25, p26, p27, p28, p29, p30, p31;
611 __m128i p32, p33, p34, p35, p36, p37, p38, p39;
612 __m128i o0, o1, o2, o3;
613 uint32_t i;
614
615 x = _mm_loadl_epi64((const __m128i *)(coef+0));
616 c0 = _mm_shuffle_epi32(x, 0x00);
617 c2 = _mm_shuffle_epi32(x, 0x55);
618
619 x = _mm_loadl_epi64((const __m128i *)(coef+4));
620 c4 = _mm_shuffle_epi32(x, 0x00);
621 c6 = _mm_shuffle_epi32(x, 0x55);
622
623 x = _mm_loadl_epi64((const __m128i *)(coef+8));
624 c8 = _mm_shuffle_epi32(x, 0x00);
625 c10 = _mm_shuffle_epi32(x, 0x55);
626
627 x = _mm_loadl_epi64((const __m128i *)(coef+12));
628 c12 = _mm_shuffle_epi32(x, 0x00);
629 c14 = _mm_shuffle_epi32(x, 0x55);
630
631 x = _mm_loadl_epi64((const __m128i *)(coef+16));
632 c16 = _mm_shuffle_epi32(x, 0x00);
633 c18 = _mm_shuffle_epi32(x, 0x55);
634
635 x = _mm_loadl_epi64((const __m128i *)(coef+20));
636 c20 = _mm_shuffle_epi32(x, 0x00);
637 c22 = _mm_shuffle_epi32(x, 0x55);
638
639 x = _mm_loadl_epi64((const __m128i *)(coef+24));
640 c24 = _mm_shuffle_epi32(x, 0x00);
641
642 for (i = 0; i < count; ++i) {
643
644 p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
645 p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
646 p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
647 p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
648 p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
649 p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
650 p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
651 p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
652
653 p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
654 p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
655 p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
656 p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
657 p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
658 p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
659 p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
660 p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
661
662 p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
663 p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
664 p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
665 p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
666 p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
667 p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
668 p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
669 p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
670
671 p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
672 p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
673 p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
674 p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
675 p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
676 p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
677 p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
678 p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
679
680 p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
681 p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
682 p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
683 p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
684 p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
685 p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
686 p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
687 p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
688
689 o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0);
690 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2));
691 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4));
692 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6));
693 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8));
694 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
695 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
696 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
697 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
698 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
699 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
700 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
701 o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
702 o0 = _mm_srai_epi32(o0, 8);
703
704 o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0);
705 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2));
706 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4));
707 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6));
708 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8));
709 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
710 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
711 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
712 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
713 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
714 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
715 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
716 o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
717 o1 = _mm_srai_epi32(o1, 8);
718
719 o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0);
720 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2));
721 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4));
722 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6));
723 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8));
724 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
725 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
726 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
727 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
728 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
729 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
730 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
731 o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
732 o2 = _mm_srai_epi32(o2, 8);
733
734 o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0);
735 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2));
736 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4));
737 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6));
738 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8));
739 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
740 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
741 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
742 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
743 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
744 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
745 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
746 o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
747 o3 = _mm_srai_epi32(o3, 8);
748
749 o0 = packus_epi32(o0, o1);
750 o2 = packus_epi32(o2, o3);
751 o0 = _mm_packus_epi16(o0, o2);
752 _mm_storeu_si128((__m128i *)dst, o0);
753
754 y0 = (const char *)y0 + 16;
755 y1 = (const char *)y1 + 16;
756 y2 = (const char *)y2 + 16;
757 y3 = (const char *)y3 + 16;
758 y4 = (const char *)y4 + 16;
759 dst = (char *)dst + 16;
760 }
761}
762
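/* Porter-Duff "source over" blend: dst = src + dst * (255 - src.a), with the
 * product divided by 256 rather than 255 as a cheap approximation (the
 * premultiplied-alpha form).  Every blend kernel below processes two 16-byte
 * vectors (eight pixels) per count8 iteration. */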
763void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
764 __m128i all1s, ina, ins;
765 __m128i in0, in1, out0, out1;
766 __m128i t0, t1, t2, t3;
767 uint32_t i;
768
769 all1s = _mm_set1_epi16(255);
770
771 for (i = 0; i < count8; ++i) {
772 in0 = _mm_loadu_si128((const __m128i *)src);
773 in1 = _mm_loadu_si128((const __m128i *)src + 1);
774 out0 = _mm_loadu_si128((const __m128i *)dst);
775 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
776
777 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
778 ina = _mm_shufflelo_epi16(ins, 0xFF);
779 ina = _mm_shufflehi_epi16(ina, 0xFF);
780 t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
781 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
782 t0 = _mm_srli_epi16(t0, 8);
783 t0 = _mm_add_epi16(t0, ins);
784
785 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
786 ina = _mm_shufflelo_epi16(ins, 0xFF);
787 ina = _mm_shufflehi_epi16(ina, 0xFF);
788 t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
789 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
790 t1 = _mm_srli_epi16(t1, 8);
791 t1 = _mm_add_epi16(t1, ins);
792
793 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
794 ina = _mm_shufflelo_epi16(ins, 0xFF);
795 ina = _mm_shufflehi_epi16(ina, 0xFF);
796 t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
797 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
798 t2 = _mm_srli_epi16(t2, 8);
799 t2 = _mm_add_epi16(t2, ins);
800
801 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
802 ina = _mm_shufflelo_epi16(ins, 0xFF);
803 ina = _mm_shufflehi_epi16(ina, 0xFF);
804 t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
805 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
806 t3 = _mm_srli_epi16(t3, 8);
807 t3 = _mm_add_epi16(t3, ins);
808
809 t0 = _mm_packus_epi16(t0, t1);
810 t2 = _mm_packus_epi16(t2, t3);
811 _mm_storeu_si128((__m128i *)dst, t0);
812 _mm_storeu_si128((__m128i *)dst + 1, t2);
813
814 src = (const __m128i *)src + 2;
815 dst = (__m128i *)dst + 2;
816 }
817}
818
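/* "destination over": dst = dst + src * (255 - dst.a) / 256. */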
819void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
820 __m128i all1s, outa, outs;
821 __m128i in0, in1, out0, out1;
822 __m128i t0, t1, t2, t3;
823 uint32_t i;
824
825 all1s = _mm_set1_epi16(255);
826
827 for (i = 0; i < count8; ++i) {
828 in0 = _mm_loadu_si128((const __m128i *)src);
829 in1 = _mm_loadu_si128((const __m128i *)src + 1);
830 out0 = _mm_loadu_si128((const __m128i *)dst);
831 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
832
833
834 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
835 outa = _mm_shufflelo_epi16(outs, 0xFF);
836 outa = _mm_shufflehi_epi16(outa, 0xFF);
837 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
838 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
839 t0 = _mm_srli_epi16(t0, 8);
840 t0 = _mm_add_epi16(t0, outs);
841
842 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
843 outa = _mm_shufflelo_epi16(outs, 0xFF);
844 outa = _mm_shufflehi_epi16(outa, 0xFF);
845 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
846 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
847 t1 = _mm_srli_epi16(t1, 8);
848 t1 = _mm_add_epi16(t1, outs);
849
850 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
851 outa = _mm_shufflelo_epi16(outs, 0xFF);
852 outa = _mm_shufflehi_epi16(outa, 0xFF);
853 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
854 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
855 t2 = _mm_srli_epi16(t2, 8);
856 t2 = _mm_add_epi16(t2, outs);
857
858 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
859 outa = _mm_shufflelo_epi16(outs, 0xFF);
860 outa = _mm_shufflehi_epi16(outa, 0xFF);
861 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
862 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
863 t3 = _mm_srli_epi16(t3, 8);
864 t3 = _mm_add_epi16(t3, outs);
865
866 t0 = _mm_packus_epi16(t0, t1);
867 t2 = _mm_packus_epi16(t2, t3);
868 _mm_storeu_si128((__m128i *)dst, t0);
869 _mm_storeu_si128((__m128i *)dst + 1, t2);
870
871 src = (const __m128i *)src + 2;
872 dst = (__m128i *)dst + 2;
873 }
874}
875
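/* "source in": dst = src * dst.a / 256. */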
876void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
877 __m128i outa;
878 __m128i in0, in1, out0, out1;
879 __m128i t0, t1, t2, t3;
880 uint32_t i;
881
882 for (i = 0; i < count8; ++i) {
883 in0 = _mm_loadu_si128((const __m128i *)src);
884 in1 = _mm_loadu_si128((const __m128i *)src + 1);
885 out0 = _mm_loadu_si128((const __m128i *)dst);
886 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
887
888 outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
889 outa = _mm_shufflelo_epi16(outa, 0xFF);
890 outa = _mm_shufflehi_epi16(outa, 0xFF);
891 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
892 t0 = _mm_mullo_epi16(t0, outa);
893 t0 = _mm_srli_epi16(t0, 8);
894
895 outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
896 outa = _mm_shufflelo_epi16(outa, 0xFF);
897 outa = _mm_shufflehi_epi16(outa, 0xFF);
898 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
899 t1 = _mm_mullo_epi16(t1, outa);
900 t1 = _mm_srli_epi16(t1, 8);
901
902 outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
903 outa = _mm_shufflelo_epi16(outa, 0xFF);
904 outa = _mm_shufflehi_epi16(outa, 0xFF);
905 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
906 t2 = _mm_mullo_epi16(t2, outa);
907 t2 = _mm_srli_epi16(t2, 8);
908
909 outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
910 outa = _mm_shufflelo_epi16(outa, 0xFF);
911 outa = _mm_shufflehi_epi16(outa, 0xFF);
912 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
913 t3 = _mm_mullo_epi16(t3, outa);
914 t3 = _mm_srli_epi16(t3, 8);
915
916 t0 = _mm_packus_epi16(t0, t1);
917 t2 = _mm_packus_epi16(t2, t3);
918 _mm_storeu_si128((__m128i *)dst, t0);
919 _mm_storeu_si128((__m128i *)dst + 1, t2);
920
921 src = (const __m128i *)src + 2;
922 dst = (__m128i *)dst + 2;
923 }
924}
925
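/* "destination in": dst = dst * src.a / 256. */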
926void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
927 __m128i ina;
928 __m128i in0, in1, out0, out1;
929 __m128i t0, t1, t2, t3;
930 uint32_t i;
931
932 for (i = 0; i < count8; ++i) {
933 in0 = _mm_loadu_si128((const __m128i *)src);
934 in1 = _mm_loadu_si128((const __m128i *)src + 1);
935 out0 = _mm_loadu_si128((const __m128i *)dst);
936 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
937
938 ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
939 ina = _mm_shufflelo_epi16(ina, 0xFF);
940 ina = _mm_shufflehi_epi16(ina, 0xFF);
941 t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
942 t0 = _mm_mullo_epi16(t0, ina);
943 t0 = _mm_srli_epi16(t0, 8);
944
945 ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
946 ina = _mm_shufflelo_epi16(ina, 0xFF);
947 ina = _mm_shufflehi_epi16(ina, 0xFF);
948 t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
949 t1 = _mm_mullo_epi16(t1, ina);
950 t1 = _mm_srli_epi16(t1, 8);
951
952 ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
953 ina = _mm_shufflelo_epi16(ina, 0xFF);
954 ina = _mm_shufflehi_epi16(ina, 0xFF);
955 t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
956 t2 = _mm_mullo_epi16(t2, ina);
957 t2 = _mm_srli_epi16(t2, 8);
958
959 ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
960 ina = _mm_shufflelo_epi16(ina, 0xFF);
961 ina = _mm_shufflehi_epi16(ina, 0xFF);
962 t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
963 t3 = _mm_mullo_epi16(t3, ina);
964 t3 = _mm_srli_epi16(t3, 8);
965
966 t0 = _mm_packus_epi16(t0, t1);
967 t2 = _mm_packus_epi16(t2, t3);
968 _mm_storeu_si128((__m128i *)dst, t0);
969 _mm_storeu_si128((__m128i *)dst + 1, t2);
970
971 src = (const __m128i *)src + 2;
972 dst = (__m128i *)dst + 2;
973 }
974}
975
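/* "source out": dst = src * (255 - dst.a) / 256. */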
976void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
977 __m128i all1s, outa;
978 __m128i in0, in1, out0, out1;
979 __m128i t0, t1, t2, t3;
980 uint32_t i;
981
982 all1s = _mm_set1_epi16(255);
983
984 for (i = 0; i < count8; ++i) {
985 in0 = _mm_loadu_si128((const __m128i *)src);
986 in1 = _mm_loadu_si128((const __m128i *)src + 1);
987 out0 = _mm_loadu_si128((const __m128i *)dst);
988 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
989
990 outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
991 outa = _mm_shufflelo_epi16(outa, 0xFF);
992 outa = _mm_shufflehi_epi16(outa, 0xFF);
993 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
994 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
995 t0 = _mm_srli_epi16(t0, 8);
996
997 outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
998 outa = _mm_shufflelo_epi16(outa, 0xFF);
999 outa = _mm_shufflehi_epi16(outa, 0xFF);
1000 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1001 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
1002 t1 = _mm_srli_epi16(t1, 8);
1003
1004 outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1005 outa = _mm_shufflelo_epi16(outa, 0xFF);
1006 outa = _mm_shufflehi_epi16(outa, 0xFF);
1007 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1008 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
1009 t2 = _mm_srli_epi16(t2, 8);
1010
1011 outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1012 outa = _mm_shufflelo_epi16(outa, 0xFF);
1013 outa = _mm_shufflehi_epi16(outa, 0xFF);
1014 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1015 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
1016 t3 = _mm_srli_epi16(t3, 8);
1017
1018 t0 = _mm_packus_epi16(t0, t1);
1019 t2 = _mm_packus_epi16(t2, t3);
1020 _mm_storeu_si128((__m128i *)dst, t0);
1021 _mm_storeu_si128((__m128i *)dst + 1, t2);
1022
1023 src = (const __m128i *)src + 2;
1024 dst = (__m128i *)dst + 2;
1025 }
1026}
1027
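/* "destination out": dst = dst * (255 - src.a) / 256. */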
1028void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
1029 __m128i all1s, ina;
1030 __m128i in0, in1, out0, out1;
1031 __m128i t0, t1, t2, t3;
1032 uint32_t i;
1033
1034 all1s = _mm_set1_epi16(255);
1035
1036 for (i = 0; i < count8; ++i) {
1037 in0 = _mm_loadu_si128((const __m128i *)src);
1038 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1039 out0 = _mm_loadu_si128((const __m128i *)dst);
1040 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1041
1042 ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1043 ina = _mm_shufflelo_epi16(ina, 0xFF);
1044 ina = _mm_shufflehi_epi16(ina, 0xFF);
1045 t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1046 t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
1047 t0 = _mm_srli_epi16(t0, 8);
1048
1049 ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1050 ina = _mm_shufflelo_epi16(ina, 0xFF);
1051 ina = _mm_shufflehi_epi16(ina, 0xFF);
1052 t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1053 t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
1054 t1 = _mm_srli_epi16(t1, 8);
1055
1056 ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1057 ina = _mm_shufflelo_epi16(ina, 0xFF);
1058 ina = _mm_shufflehi_epi16(ina, 0xFF);
1059 t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1060 t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
1061 t2 = _mm_srli_epi16(t2, 8);
1062
1063 ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1064 ina = _mm_shufflelo_epi16(ina, 0xFF);
1065 ina = _mm_shufflehi_epi16(ina, 0xFF);
1066 t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1067 t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
1068 t3 = _mm_srli_epi16(t3, 8);
1069
1070 t0 = _mm_packus_epi16(t0, t1);
1071 t2 = _mm_packus_epi16(t2, t3);
1072 _mm_storeu_si128((__m128i *)dst, t0);
1073 _mm_storeu_si128((__m128i *)dst + 1, t2);
1074
1075 src = (const __m128i *)src + 2;
1076 dst = (__m128i *)dst + 2;
1077 }
1078}
1079
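/* "source atop": dst.rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) / 256,
 * while the destination alpha byte is kept via the M0001 blend mask. */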
1080void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1081 const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1082 __m128i all1s, ina, outa, ins, outs;
1083 __m128i in0, in1, out0, out1;
1084 __m128i t0, t1, t2, t3;
1085 uint32_t i;
1086
1087 all1s = _mm_set1_epi16(255);
1088
1089 for (i = 0; i < count8; ++i) {
1090 in0 = _mm_loadu_si128((const __m128i *)src);
1091 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1092 out0 = _mm_loadu_si128((const __m128i *)dst);
1093 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1094
1095 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1096 ina = _mm_shufflelo_epi16(ins, 0xFF);
1097 ina = _mm_shufflehi_epi16(ina, 0xFF);
1098 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1099 outa = _mm_shufflelo_epi16(outs, 0xFF);
1100 outa = _mm_shufflehi_epi16(outa, 0xFF);
1101 t0 = _mm_sub_epi16(all1s, ina);
1102 t0 = _mm_mullo_epi16(t0, outs);
1103 t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1104 t0 = _mm_srli_epi16(t0, 8);
1105
1106 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1107 ina = _mm_shufflelo_epi16(ins, 0xFF);
1108 ina = _mm_shufflehi_epi16(ina, 0xFF);
1109 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1110 outa = _mm_shufflelo_epi16(outs, 0xFF);
1111 outa = _mm_shufflehi_epi16(outa, 0xFF);
1112 t1 = _mm_sub_epi16(all1s, ina);
1113 t1 = _mm_mullo_epi16(t1, outs);
1114 t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1115 t1 = _mm_srli_epi16(t1, 8);
1116
1117 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1118 ina = _mm_shufflelo_epi16(ins, 0xFF);
1119 ina = _mm_shufflehi_epi16(ina, 0xFF);
1120 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1121 outa = _mm_shufflelo_epi16(outs, 0xFF);
1122 outa = _mm_shufflehi_epi16(outa, 0xFF);
1123 t2 = _mm_sub_epi16(all1s, ina);
1124 t2 = _mm_mullo_epi16(t2, outs);
1125 t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1126 t2 = _mm_srli_epi16(t2, 8);
1127
1128 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1129 ina = _mm_shufflelo_epi16(ins, 0xFF);
1130 ina = _mm_shufflehi_epi16(ina, 0xFF);
1131 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1132 outa = _mm_shufflelo_epi16(outs, 0xFF);
1133 outa = _mm_shufflehi_epi16(outa, 0xFF);
1134 t3 = _mm_sub_epi16(all1s, ina);
1135 t3 = _mm_mullo_epi16(t3, outs);
1136 t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1137 t3 = _mm_srli_epi16(t3, 8);
1138
1139 t0 = _mm_packus_epi16(t0, t1);
1140 t0 = blendv_epi8(t0, out0, M0001);
1141 t2 = _mm_packus_epi16(t2, t3);
1142 t2 = blendv_epi8(t2, out1, M0001);
1143 _mm_storeu_si128((__m128i *)dst, t0);
1144 _mm_storeu_si128((__m128i *)dst + 1, t2);
1145
1146 src = (const __m128i *)src + 2;
1147 dst = (__m128i *)dst + 2;
1148 }
1149}
1150
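/* "destination atop": dst.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) / 256,
 * while the source alpha byte is kept via the M0001 blend mask. */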
1151void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1152 const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1153 __m128i all1s, ina, ins, outa, outs;
1154 __m128i in0, in1, out0, out1;
1155 __m128i t0, t1, t2, t3;
1156 uint32_t i;
1157
1158 all1s = _mm_set1_epi16(255);
1159
1160 for (i = 0; i < count8; ++i) {
1161 in0 = _mm_loadu_si128((const __m128i *)src);
1162 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1163 out0 = _mm_loadu_si128((const __m128i *)dst);
1164 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1165
1166 ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1167 ina = _mm_shufflelo_epi16(ins, 0xFF);
1168 ina = _mm_shufflehi_epi16(ina, 0xFF);
1169 outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1170 outa = _mm_shufflelo_epi16(outs, 0xFF);
1171 outa = _mm_shufflehi_epi16(outa, 0xFF);
1172 t0 = _mm_sub_epi16(all1s, outa);
1173 t0 = _mm_mullo_epi16(t0, ins);
1174 t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1175 t0 = _mm_srli_epi16(t0, 8);
1176
1177 ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1178 ina = _mm_shufflelo_epi16(ins, 0xFF);
1179 ina = _mm_shufflehi_epi16(ina, 0xFF);
1180 outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1181 outa = _mm_shufflelo_epi16(outs, 0xFF);
1182 outa = _mm_shufflehi_epi16(outa, 0xFF);
1183 t1 = _mm_sub_epi16(all1s, outa);
1184 t1 = _mm_mullo_epi16(t1, ins);
1185 t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1186 t1 = _mm_srli_epi16(t1, 8);
1187
1188 ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1189 ina = _mm_shufflelo_epi16(ins, 0xFF);
1190 ina = _mm_shufflehi_epi16(ina, 0xFF);
1191 outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1192 outa = _mm_shufflelo_epi16(outs, 0xFF);
1193 outa = _mm_shufflehi_epi16(outa, 0xFF);
1194 t2 = _mm_sub_epi16(all1s, outa);
1195 t2 = _mm_mullo_epi16(t2, ins);
1196 t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1197 t2 = _mm_srli_epi16(t2, 8);
1198
1199 ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1200 ina = _mm_shufflelo_epi16(ins, 0xFF);
1201 ina = _mm_shufflehi_epi16(ina, 0xFF);
1202 outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1203 outa = _mm_shufflelo_epi16(outs, 0xFF);
1204 outa = _mm_shufflehi_epi16(outa, 0xFF);
1205 t3 = _mm_sub_epi16(all1s, outa);
1206 t3 = _mm_mullo_epi16(t3, ins);
1207 t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1208 t3 = _mm_srli_epi16(t3, 8);
1209
1210 t0 = _mm_packus_epi16(t0, t1);
1211 t0 = blendv_epi8(t0, in0, M0001);
1212 t2 = _mm_packus_epi16(t2, t3);
1213 t2 = blendv_epi8(t2, in1, M0001);
1214 _mm_storeu_si128((__m128i *)dst, t0);
1215 _mm_storeu_si128((__m128i *)dst + 1, t2);
1216
1217 src = (const __m128i *)src + 2;
1218 dst = (__m128i *)dst + 2;
1219 }
1220}
1221
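/* Bitwise XOR of source and destination pixels (not the Porter-Duff XOR
 * operator). */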
1222void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
1223 __m128i in0, in1, out0, out1;
1224 uint32_t i;
1225
1226 for (i = 0; i < count8; ++i) {
1227 in0 = _mm_loadu_si128((const __m128i *)src);
1228 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1229 out0 = _mm_loadu_si128((const __m128i *)dst);
1230 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1231
1232 out0 = _mm_xor_si128(out0, in0);
1233 out1 = _mm_xor_si128(out1, in1);
1234
1235 _mm_storeu_si128((__m128i *)dst, out0);
1236 _mm_storeu_si128((__m128i *)dst + 1, out1);
1237
1238 src = (const __m128i *)src + 2;
1239 dst = (__m128i *)dst + 2;
1240 }
1241}
1242
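/* Per-channel multiply: dst = src * dst / 256. */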
1243void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
1244 __m128i in0, in1, out0, out1;
1245 __m128i t0, t1, t2, t3;
1246 uint32_t i;
1247
1248 for (i = 0; i < count8; ++i) {
1249 in0 = _mm_loadu_si128((const __m128i *)src);
1250 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1251 out0 = _mm_loadu_si128((const __m128i *)dst);
1252 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1253
1254 t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1255 t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
1256 t0 = _mm_srli_epi16(t0, 8);
1257
1258 t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1259 t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
1260 t1 = _mm_srli_epi16(t1, 8);
1261
1262 t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1263 t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
1264 t2 = _mm_srli_epi16(t2, 8);
1265
1266 t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1267 t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
1268 t3 = _mm_srli_epi16(t3, 8);
1269
1270 t0 = _mm_packus_epi16(t0, t1);
1271 t2 = _mm_packus_epi16(t2, t3);
1272 _mm_storeu_si128((__m128i *)dst, t0);
1273 _mm_storeu_si128((__m128i *)dst + 1, t2);
1274
1275 src = (const __m128i *)src + 2;
1276 dst = (__m128i *)dst + 2;
1277 }
1278}
1279
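/* Per-channel saturating add: dst = min(dst + src, 255). */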
1280void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
1281 __m128i in0, in1, out0, out1;
1282 uint32_t i;
1283
1284 for (i = 0; i < count8; ++i) {
1285 in0 = _mm_loadu_si128((const __m128i *)src);
1286 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1287 out0 = _mm_loadu_si128((const __m128i *)dst);
1288 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1289
1290 out0 = _mm_adds_epu8(out0, in0);
1291 out1 = _mm_adds_epu8(out1, in1);
1292
1293 _mm_storeu_si128((__m128i *)dst, out0);
1294 _mm_storeu_si128((__m128i *)dst + 1, out1);
1295
1296 src = (const __m128i *)src + 2;
1297 dst = (__m128i *)dst + 2;
1298 }
1299}
1300
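/* Per-channel saturating subtract: dst = max(dst - src, 0). */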
1301void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
1302 __m128i in0, in1, out0, out1;
1303 uint32_t i;
1304
1305 for (i = 0; i < count8; ++i) {
1306 in0 = _mm_loadu_si128((const __m128i *)src);
1307 in1 = _mm_loadu_si128((const __m128i *)src + 1);
1308 out0 = _mm_loadu_si128((const __m128i *)dst);
1309 out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1310
1311 out0 = _mm_subs_epu8(out0, in0);
1312 out1 = _mm_subs_epu8(out1, in1);
1313
1314 _mm_storeu_si128((__m128i *)dst, out0);
1315 _mm_storeu_si128((__m128i *)dst + 1, out1);
1316
1317 src = (const __m128i *)src + 2;
1318 dst = (__m128i *)dst + 2;
1319 }
1320}

} // namespace renderscript
} // namespace android