| /* |
| * Copyright (C) 2015 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: |
| #define END(f) .size f, .-f; |
| |
| /* Fixed-point precision after vertical pass -- 16-bit data minus 1 sign and 1 |
| * integer (bicubic has a little overshoot). It would also be possible to add |
| * a temporary DC bias to eliminate the sign bit for more precision, but that's |
| * extra arithmetic. |
| */ |
| .set VERTBITS, 14 |
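| /* i.e. 16 bits = 1 sign + 1 integer (overshoot headroom) + 14 fraction bits. */ |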
| |
| /* The size of the scratch buffer in which we store our vertically convolved |
| * intermediates. |
| */ |
| .set CHUNKSHIFT, 7 /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */ |
| .set CHUNKSIZE, (1 << CHUNKSHIFT) |
| |
| /* The number of components processed in a single iteration of the innermost |
| * loop. |
| */ |
| .set VECSHIFT, 3 |
| .set VECSIZE, (1<<VECSHIFT) |
| |
| /* Read four different lines (except at edges where addresses may be clamped, |
| * which is why we don't simply take base and stride registers), and multiply |
| * and accumulate them by the coefficients in v3[0..3], leaving the results in |
| * v12. This gives eight 16-bit results representing a horizontal line of 2-8 |
| * input pixels (depending on number of components per pixel) to be fed into |
| * the horizontal scaling pass. |
| * |
| * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are |
| * known to represent negative values and VMLS is used to implement this). |
| * Output is VERTBITS signed fixed-point, which must leave room for a little |
| * overshoot beyond [0,1.0). |
| */ |
| .macro vert8, dstlo=v12.4h, dsthi=v12.8h |
| ld1 {v8.8b}, [x4], #8 |
| ld1 {v9.8b}, [x5], #8 |
| ld1 {v10.8b}, [x6], #8 |
| ld1 {v11.8b}, [x7], #8 |
| uxtl v8.8h, v8.8b |
| uxtl v9.8h, v9.8b |
| uxtl v10.8h, v10.8b |
| uxtl v11.8h, v11.8b |
| umull v12.4s, v9.4h, v3.h[1] |
| umull2 v13.4s, v9.8h, v3.h[1] |
| umlsl v12.4s, v8.4h, v3.h[0] |
| umlsl2 v13.4s, v8.8h, v3.h[0] |
| umlal v12.4s, v10.4h, v3.h[2] |
| umlal2 v13.4s, v10.8h, v3.h[2] |
| umlsl v12.4s, v11.4h, v3.h[3] |
| umlsl2 v13.4s, v11.8h, v3.h[3] |
| |
| /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies), |
| * minus VERTBITS (the number of fraction bits we want to keep from |
| * here on). |
| */ |
| sqshrn \dstlo, v12.4s, #8 + (16 - VERTBITS) |
| sqshrn2 \dsthi, v13.4s, #8 + (16 - VERTBITS) |
| .endm |
| |
| /* As above, but only four 16-bit results, placed (by default) into the |
| * high half of v12. |
| */ |
| .macro vert4, dst=v12.8h |
| ld1 {v8.s}[0], [x4], #4 |
| ld1 {v9.s}[0], [x5], #4 |
| ld1 {v10.s}[0], [x6], #4 |
| ld1 {v11.s}[0], [x7], #4 |
| uxtl v8.8h, v8.8b |
| uxtl v9.8h, v9.8b |
| uxtl v10.8h, v10.8b |
| uxtl v11.8h, v11.8b |
| umull v12.4s, v9.4h, v3.h[1] |
| umlsl v12.4s, v8.4h, v3.h[0] |
| umlal v12.4s, v10.4h, v3.h[2] |
| umlsl v12.4s, v11.4h, v3.h[3] |
| .ifc \dst,v12.8h |
| sqshrn2 \dst, v12.4s, #8 + (16 - VERTBITS) |
| .else |
| sqshrn \dst, v12.4s, #8 + (16 - VERTBITS) |
| .endif |
| .endm |
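| |
| /* A rough C model of one element of the vert8/vert4 computation above |
| * (illustrative only; hypothetical names, not part of the build).  The four |
| * input rows arrive in x4-x7 and yr[0..3] are the unsigned 16-bit |
| * coefficients, with yr[0] and yr[3] applied negatively: |
| * |
| *     int32_t acc = - (int32_t)row0[i] * yr[0] |
| *                   + (int32_t)row1[i] * yr[1] |
| *                   + (int32_t)row2[i] * yr[2] |
| *                   - (int32_t)row3[i] * yr[3]; |
| *     out[i] = sat_s16(acc >> (8 + 16 - VERTBITS));  // sqshrn saturates |
| */ |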
| |
| |
| /* During horizontal resize, having CHUNKSIZE input pixels available means |
| * being able to produce a varying amount of output, depending on the phase |
| * of the data. |
| * This function calculates the minimum number of VECSIZE chunks extracted from |
| * a CHUNKSIZE window (x1), and the threshold value for when the count will be |
| * one higher than that (x0). |
| * These work out, conveniently, to be the quotient and remainder from: |
| * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE) |
| * |
| * The two values are packed together in a uint64_t for convenience; and |
| * they are, in fact, used this way as an arithmetic short-cut later on. |
| */ |
| /* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */ |
| ENTRY(rsdIntrinsicResize_oscctl_K) |
| lsl x2, x0, #VECSHIFT |
| mov x0, #(CHUNKSIZE << 16) - 1 |
| add x0, x0, x2 |
| udiv x1, x0, x2 |
| msub x0, x1, x2, x0 |
| add x0, x0, x1, LSL #32 |
| ret |
| END(rsdIntrinsicResize_oscctl_K) |
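| |
| /* A minimal C sketch of the function above (illustrative only, not part of |
| * the build); xinc is 16.16 fixed-point and CHUNKSIZE is promoted to 16.16 |
| * to match: |
| * |
| *     uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) { |
| *         uint64_t vxinc = (uint64_t)xinc << VECSHIFT; |
| *         uint64_t x = ((uint64_t)CHUNKSIZE << 16) - 1 + vxinc; |
| *         uint64_t quot = x / vxinc;   // min iterations (high word) |
| *         uint64_t rem  = x % vxinc;   // threshold (low word) |
| *         return rem + (quot << 32); |
| *     } |
| */ |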
| |
| /* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code. |
| * For the most part the vertical pass (the outer loop) is the same for all |
| * versions. Exceptions are handled in-line with conditional assembly. |
| */ |
| .irp comp, 1, 2, 4 |
| .if \comp == 1 |
| .set COMPONENT_SHIFT, 0 |
| .elseif \comp == 2 |
| .set COMPONENT_SHIFT, 1 |
| .elseif \comp == 4 |
| .set COMPONENT_SHIFT, 2 |
| .else |
| .error "Unknown component count" |
| .endif |
| .set COMPONENT_COUNT, (1 << COMPONENT_SHIFT) |
| .set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT) |
| |
| .set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2 |
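| /* i.e. a double buffer of 2 * CHUNKSIZE pixels of s16 intermediates, plus |
| * four extra pixels at the end used for overflow address handling by the |
| * four-tap window (see the copy loop below). |
| */ |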
| |
| /* void rsdIntrinsicResizeB1_K( |
| * uint8_t * restrict dst, // x0 |
| * size_t count, // x1 |
| * uint32_t xf, // x2 |
| * uint32_t xinc, // x3 |
| * uint8_t const * restrict srcn, // x4 |
| * uint8_t const * restrict src0, // x5 |
| * uint8_t const * restrict src1, // x6 |
| * uint8_t const * restrict src2, // x7 |
| * size_t xclip, // [sp,#0] -> [sp,#80] -> x13 |
| * size_t avail, // [sp,#8] -> [sp,#88] -> x11 |
| * uint64_t osc_ctl, // [sp,#16] -> [sp,#96] -> x10 |
| * int32_t const *yr, // [sp,#24] -> [sp,#104] -> v4 (copied to v3 for scalar access) |
| */ |
| ENTRY(rsdIntrinsicResizeB\comp\()_K) |
| sub x8, sp, #48 |
| sub sp, sp, #80 |
| st1 {v8.1d - v11.1d}, [sp] |
| st1 {v12.1d - v15.1d}, [x8] |
| str x19, [x8, #32] |
| |
| /* align the working buffer on the stack to make it easy to use bit |
| * twiddling for address calculations. |
| */ |
| sub x12, sp, #BUFFER_SIZE |
| bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1 |
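| /* The mask is the power-of-two size of the double buffer (2 * CHUNKSIZE |
| * pixels of s16), so buffer offsets can wrap using plain bit operations |
| * (see the eor and tst on x12 below). |
| */ |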
| |
| ldr x8, [sp,#104] // yr |
| adrp x9, intrinsic_resize_consts |
| add x9, x9, :lo12:intrinsic_resize_consts |
| ld1 {v4.4s}, [x8] |
| ld1 {v5.8h}, [x9] |
| sqxtun v4.4h, v4.4s // yr |
| dup v6.8h, w2 |
| dup v7.8h, w3 |
| mla v6.8h, v5.8h, v7.8h // vxf |
| shl v7.8h, v7.8h, #VECSHIFT // vxinc |
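| /* v6 = per-lane x fractions, xf + i * xinc for i = 0..7 (low 16 bits |
| * only; the scalar code shadows this with full precision).  v7 = the |
| * per-iteration step, xinc * VECSIZE. |
| */ |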
| |
| /* Compute starting condition for oscillator used to compute ahead |
| * of time how many iterations are possible before needing to |
| * refill the working buffer. This is based on the fixed-point |
| * index of the last element in the vector of pixels processed in |
| * each iteration, counting up until it would overflow. |
| */ |
| sub x8, x2, x3 |
| lsl x9, x3, #VECSHIFT |
| add x8, x8, x9 |
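| /* i.e. x8 = xf + xinc * (VECSIZE - 1) */ |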
| |
| ldr x10, [sp,#96] // osc_ctl |
| ldp x13,x11, [sp,#80] // xclip, avail |
| |
| mov x19, sp |
| mov sp, x12 |
| |
| /* x4-x7 contain pointers to the four lines of input to be |
| * convolved. These pointers have been clamped vertically and |
| * horizontally (which is why it's not a simple row/stride pair), |
| * and the xclip argument (now in x13) indicates how many pixels the |
| * pointer's x position is offset from the true position. This value should |
| * be 0, 1, or 2 only. |
| * |
| * Start by placing four pixels worth of input at the far end of |
| * the buffer. As many as two of these may be clipped, so four |
| * pixels are fetched, and then the first pixel is duplicated and |
| * the data shifted according to xclip. The source pointers are |
| * then also adjusted according to xclip so that subsequent fetches |
| * match. |
| */ |
| mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */ |
| sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1) /* x14 = buffer - xclip pixels */ |
| add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2 /* last four pixels of the buffer */ |
| add x14, x14, #4 * COMPONENT_COUNT * 2 /* ...+ 4 pixels: window shifted back by xclip */ |
| .if \comp == 1 |
| vert4 v12.4h |
| dup v11.4h, v12.h[0] |
| st1 {v11.4h,v12.4h}, [x12] |
| ld1 {v12.4h}, [x14] |
| st1 {v12.4h}, [x15] |
| .elseif \comp == 2 |
| vert8 |
| dup v11.4s, v12.s[0] |
| st1 {v11.8h,v12.8h}, [x12] |
| ld1 {v12.8h}, [x14] |
| st1 {v12.8h}, [x15] |
| .elseif \comp == 4 |
| vert8 v14.4h, v14.8h |
| vert8 v15.4h, v15.8h |
| dup v12.2d, v14.d[0] |
| dup v13.2d, v14.d[0] |
| st1 {v12.8h,v13.8h}, [x12], #32 |
| st1 {v14.8h,v15.8h}, [x12] |
| sub x12, x12, #32 |
| ld1 {v11.8h,v12.8h}, [x14] |
| st1 {v11.8h,v12.8h}, [x15] |
| .endif |
| /* Count off four pixels into the working buffer. |
| */ |
| sub x11, x11, #4 |
| /* Incoming pointers were to the first _legal_ pixel. Four pixels |
| * were read unconditionally, but some may have been discarded by |
| * xclip, so we rewind the pointers to compensate. |
| */ |
| sub x4, x4, x13, LSL #(COMPONENT_SHIFT) |
| sub x5, x5, x13, LSL #(COMPONENT_SHIFT) |
| sub x6, x6, x13, LSL #(COMPONENT_SHIFT) |
| sub x7, x7, x13, LSL #(COMPONENT_SHIFT) |
| |
| /* First tap starts where we just pre-filled, at the end of the |
| * buffer. |
| */ |
| add x2, x2, #(CHUNKSIZE * 2 - 4) << 16 |
| |
| /* Use overflowing arithmetic to implement wraparound array |
| * indexing. |
| */ |
| lsl x2, x2, #(47 - CHUNKSHIFT) |
| lsl x3, x3, #(47 - CHUNKSHIFT) |
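| /* After these shifts the (CHUNKSHIFT + 1)-bit buffer index occupies the |
| * top of the register, so |
| *     index = x2 >> (63 - CHUNKSHIFT) |
| * recovers (x >> 16) % (2 * CHUNKSIZE), and the index wraps for free |
| * when x2 overflows. |
| */ |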
| |
| |
| /* Start of outermost loop. |
| * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the |
| * number of iterations of the inner loop that can be performed and |
| * get into that. |
| * |
| * The fill is complicated by the possibility of running out of |
| * input before the scratch buffer is filled. If this isn't a risk |
| * then it's handled by the simple loop at 2:, otherwise the |
| * horrible loop at 3:. |
| */ |
| 1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */ |
| subs x11, x11, #CHUNKSIZE |
| bge 2f /* if at least CHUNKSIZE are available... */ |
| add x11, x11, #CHUNKSIZE /* if they're not... */ |
| b 4f |
| /* basic fill loop, processing 8 bytes at a time until there are |
| * fewer than eight bytes available. |
| */ |
| 3: vert8 |
| sub x11, x11, #8 / COMPONENT_COUNT |
| st1 {v12.8h}, [x12], #16 |
| 4: cmp x11, #8 / COMPONENT_COUNT - 1 |
| bgt 3b |
| .if \comp == 4 |
| blt 3f |
| /* The last pixel (four bytes) if necessary */ |
| vert4 |
| .else |
| cmp x11, #1 |
| blt 3f |
| /* The last pixels if necessary */ |
| sub x4, x4, #8 |
| sub x5, x5, #8 |
| sub x6, x6, #8 |
| sub x7, x7, #8 |
| add x4, x4, x11, LSL #(COMPONENT_SHIFT) |
| add x5, x5, x11, LSL #(COMPONENT_SHIFT) |
| add x6, x6, x11, LSL #(COMPONENT_SHIFT) |
| add x7, x7, x11, LSL #(COMPONENT_SHIFT) |
| vert8 |
| sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1) |
| sub sp, sp, #32 |
| sub x11, x11, #16 |
| .if \comp == 1 |
| dup v13.8h, v12.h[7] |
| .elseif \comp == 2 |
| dup v13.4s, v12.s[3] |
| .endif |
| st1 {v12.8h,v13.8h}, [sp] |
| ld1 {v12.8h}, [x11] |
| add sp, sp, #32 |
| b 4f |
| .endif |
| /* Keep filling until we get to the end of this chunk of the buffer */ |
| 3: |
| .if \comp == 1 |
| dup v12.8h, v12.h[7] |
| .elseif \comp == 2 |
| dup v12.4s, v12.s[3] |
| .elseif \comp == 4 |
| dup v12.2d, v12.d[1] |
| .endif |
| 4: st1 {v12.8h}, [x12], #16 |
| tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 |
| bne 3b |
| b 4f |
| |
| .align 4 |
| 2: /* Quickly pull a chunk of data into the working buffer. |
| */ |
| vert8 |
| st1 {v12.8h}, [x12], #16 |
| vert8 |
| st1 {v12.8h}, [x12], #16 |
| tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 |
| bne 2b |
| cmp x11, #0 |
| bne 3f |
| 4: /* if we end with 0 pixels left we'll have nothing handy to spread |
| * across to the right, so we rewind a bit. |
| */ |
| mov x11, #1 |
| sub x4, x4, #COMPONENT_COUNT |
| sub x5, x5, #COMPONENT_COUNT |
| sub x6, x6, #COMPONENT_COUNT |
| sub x7, x7, #COMPONENT_COUNT |
| 3: /* copy four taps (width of cubic window) to far end for overflow |
| * address handling |
| */ |
| sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2 |
| eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2 |
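| /* x13 now points at the start of the chunk just filled, while x12 flips |
| * to the start of the other half of the double buffer (possible because |
| * the buffer base is aligned to its power-of-two size). |
| */ |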
| .if \comp == 1 |
| ld1 {v14.4h}, [x13] |
| .elseif \comp == 2 |
| ld1 {v14.8h}, [x13] |
| .elseif \comp == 4 |
| ld1 {v14.8h,v15.8h}, [x13] |
| .endif |
| add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2 |
| .if \comp == 1 |
| st1 {v14.4h}, [x13] |
| .elseif \comp == 2 |
| st1 {v14.8h}, [x13] |
| .elseif \comp == 4 |
| st1 {v14.8h,v15.8h}, [x13] |
| .endif |
| /* The high 32 bits of x10 contain the maximum possible iteration |
| * count, but if x8 is greater than the low 32 bits of x10 then |
| * this indicates that the count must be reduced by one for this |
| * iteration to avoid reading past the end of the available data. |
| */ |
| sub x13, x10, x8 |
| lsr x13, x13, #32 |
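| /* In one packed subtraction: x13 = quot - (x8 > rem), because the |
| * subtraction borrows out of the low 32 bits exactly when x8 exceeds |
| * the threshold in the low half of x10. |
| */ |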
| |
| madd x8, x13, x9, x8 |
| sub x8, x8, #(CHUNKSIZE << 16) |
| |
| /* prefer to count pixels, rather than vectors, to clarify the tail |
| * store case on exit. |
| */ |
| lsl x13, x13, #VECSHIFT |
| cmp x13, x1 |
| csel x13, x1, x13, gt |
| |
| sub x1, x1, x13 |
| |
| lsl x13, x13, #COMPONENT_SHIFT |
| |
| mov w14, #0x8000 |
| movi v30.8h, #3 |
| dup v31.8h, w14 |
| |
| cmp x13, #0 |
| bgt 3f |
| cmp x1, #0 |
| bgt 1b /* an extreme case where we shouldn't use code in this structure */ |
| b 9f |
| |
| .align 4 |
| 2: /* Inner loop continues here, but starts at 3:, see end of loop |
| * below for explanation. */ |
| .if LOOP_OUTPUT_SIZE == 4 |
| st1 {v8.s}[0], [x0], #4 |
| .elseif LOOP_OUTPUT_SIZE == 8 |
| st1 {v8.8b}, [x0], #8 |
| .elseif LOOP_OUTPUT_SIZE == 16 |
| st1 {v8.16b}, [x0], #16 |
| .elseif LOOP_OUTPUT_SIZE == 32 |
| st1 {v8.16b,v9.16b}, [x0], #32 |
| .endif |
| /* Inner loop: here the four x coefficients for each tap are |
| * calculated in vector code, and the addresses are calculated in |
| * scalar code, and these calculations are interleaved. |
| */ |
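| /* A plain-math sketch of the weights computed below: they are the four |
| * Catmull-Rom bicubic taps at fraction t (sxf, Q15): |
| *     w0 = (-t^3 + 2*t^2 - t) / 2     -> v0 (w0 <= 0, applied via smull) |
| *     w1 = (3*t^3 - 5*t^2 + 2) / 2    -> v1 holds -w1 (applied via smlsl) |
| *     w2 = (-3*t^3 + 4*t^2 + t) / 2   -> v2 holds -w2 (applied via smlsl) |
| *     w3 = (t^3 - t^2) / 2            -> v3 (w3 <= 0, applied via smlal) |
| */ |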
| 3: ushr v8.8h, v6.8h, #1 // sxf |
| lsr x14, x2, #(63 - CHUNKSHIFT) |
| sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2 |
| add x2, x2, x3 |
| sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3 |
| lsr x15, x2, #(63 - CHUNKSHIFT) |
| sshll v11.4s, v9.4h, #2 |
| sshll2 v12.4s, v9.8h, #2 |
| add x2, x2, x3 |
| smlsl v11.4s, v10.4h, v30.4h |
| smlsl2 v12.4s, v10.8h, v30.8h |
| lsr x16, x2, #(63 - CHUNKSHIFT) |
| |
| shadd v0.8h, v10.8h, v8.8h |
| add x2, x2, x3 |
| sub v0.8h, v9.8h, v0.8h |
| lsr x17, x2, #(63 - CHUNKSHIFT) |
| |
| saddw v1.4s, v11.4s, v9.4h |
| saddw2 v13.4s, v12.4s, v9.8h |
| add x2, x2, x3 |
| shrn v1.4h, v1.4s, #1 |
| shrn2 v1.8h, v13.4s, #1 |
| add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) |
| sub v1.8h, v1.8h, v31.8h |
| add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) |
| |
| saddw v2.4s, v11.4s, v8.4h |
| saddw2 v13.4s, v12.4s, v8.8h |
| add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) |
| shrn v2.4h, v2.4s, #1 |
| shrn2 v2.8h, v13.4s, #1 |
| add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) |
| neg v2.8h, v2.8h |
| |
| shsub v3.8h, v10.8h, v9.8h |
| |
| /* increment the x fractional parts (overflow is ignored, as the |
| * scalar arithmetic shadows this addition with full precision). |
| */ |
| add v6.8h, v6.8h, v7.8h |
| |
| /* At this point we have four pointers in x14-x17, pointing to the |
| * four taps in the scratch buffer that must be convolved together |
| * to produce an output pixel (one output pixel per pointer). |
| * These pointers usually overlap, but their spacing is irregular |
| * so resolving the redundancy through L1 is a pragmatic solution. |
| * |
| * The scratch buffer is made of signed 16-bit data, holding over |
| * some extra precision, and overshoot, from the vertical pass. |
| * |
| * We also have the 16-bit unsigned fixed-point weights for each |
| * of the four taps in v0 - v3. That's eight pixels worth of |
| * coefficients when we have only four pointers, so calculations |
| * for four more pixels are interleaved with the fetch and permute |
| * code for each variant in the following code. |
| * |
| * The data arrangement is less than ideal for any pixel format, |
| * but permuting loads help to mitigate most of the problems. |
| * |
| * Note also that the two outside taps of a bicubic are negative, |
| * but these coefficients are unsigned. The sign is hard-coded by |
| * use of multiply-and-subtract operations. |
| */ |
| .if \comp == 1 |
| /* The uchar1 case. |
| * Issue one lanewise ld4.h to load four consecutive pixels from |
| * one pointer (one pixel) into four different registers; then load |
| * four consecutive s16 values from the next pointer (pixel) into |
| * the next lane of those four registers, etc., so that we finish |
| * with v12 - v15 representing the four taps, and each lane |
| * representing a separate pixel. |
| * |
| * The first ld4 uses a splat to avoid any false dependency on |
| * the previous state of the register. |
| */ |
| ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14] |
| lsr x14, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15] |
| add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) |
| lsr x15, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16] |
| add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) |
| lsr x16, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17] |
| add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) |
| lsr x17, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14] |
| add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) |
| ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15] |
| ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16] |
| ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17] |
| |
| smull v8.4s, v12.4h, v0.4h |
| smull2 v9.4s, v12.8h, v0.8h |
| smlsl v8.4s, v13.4h, v1.4h |
| smlsl2 v9.4s, v13.8h, v1.8h |
| smlsl v8.4s, v14.4h, v2.4h |
| smlsl2 v9.4s, v14.8h, v2.8h |
| smlal v8.4s, v15.4h, v3.4h |
| smlal2 v9.4s, v15.8h, v3.8h |
| |
| subs x13, x13, #LOOP_OUTPUT_SIZE |
| |
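| /* sqrshrn #15 removes the Q15 weight scale (rounding, saturating); |
| * sqrshrun #(VERTBITS - 8) then converts the VERTBITS fixed-point |
| * intermediate back to 8-bit pixels with unsigned saturation. |
| */ |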
| sqrshrn v8.4h, v8.4s, #15 |
| sqrshrn2 v8.8h, v9.4s, #15 |
| |
| sqrshrun v8.8b, v8.8h, #VERTBITS - 8 |
| .elseif \comp == 2 |
| /* The uchar2 case: |
| * This time load pairs of values into adjacent lanes in v12 - v15 |
| * by aliasing them as u32 data; leaving room for only four pixels, |
| * so the process has to be done twice. This also means that the |
| * coefficients (which cover eight separate pixels) no longer align |
| * lane-for-lane with the loaded data, so they have to be doubled up to match. |
| */ |
| ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14] |
| lsr x14, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15] |
| add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) |
| lsr x15, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16] |
| add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) |
| lsr x16, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17] |
| add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) |
| lsr x17, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| |
| /* double-up coefficients to align with component pairs */ |
| zip1 v16.8h, v0.8h, v0.8h |
| add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) |
| zip1 v17.8h, v1.8h, v1.8h |
| zip1 v18.8h, v2.8h, v2.8h |
| zip1 v19.8h, v3.8h, v3.8h |
| |
| smull v8.4s, v12.4h, v16.4h |
| smull2 v9.4s, v12.8h, v16.8h |
| smlsl v8.4s, v13.4h, v17.4h |
| smlsl2 v9.4s, v13.8h, v17.8h |
| smlsl v8.4s, v14.4h, v18.4h |
| smlsl2 v9.4s, v14.8h, v18.8h |
| smlal v8.4s, v15.4h, v19.4h |
| smlal2 v9.4s, v15.8h, v19.8h |
| |
| sqrshrn v8.4h, v8.4s, #15 |
| sqrshrn2 v8.8h, v9.4s, #15 |
| |
| ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14] |
| ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15] |
| ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16] |
| ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17] |
| |
| /* double-up coefficients to align with component pairs */ |
| zip2 v16.8h, v0.8h, v0.8h |
| zip2 v17.8h, v1.8h, v1.8h |
| zip2 v18.8h, v2.8h, v2.8h |
| zip2 v19.8h, v3.8h, v3.8h |
| |
| smull v10.4s, v12.4h, v16.4h |
| smull2 v11.4s, v12.8h, v16.8h |
| smlsl v10.4s, v13.4h, v17.4h |
| smlsl2 v11.4s, v13.8h, v17.8h |
| smlsl v10.4s, v14.4h, v18.4h |
| smlsl2 v11.4s, v14.8h, v18.8h |
| smlal v10.4s, v15.4h, v19.4h |
| smlal2 v11.4s, v15.8h, v19.8h |
| |
| subs x13, x13, #LOOP_OUTPUT_SIZE |
| |
| sqrshrn v9.4h, v10.4s, #15 |
| sqrshrn2 v9.8h, v11.4s, #15 |
| |
| sqrshrun v8.8b, v8.8h, #VERTBITS - 8 |
| sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8 |
| .elseif \comp == 4 |
| /* The uchar4 case. |
| * This case is comparatively painless because four s16s are the |
| * smallest addressable unit for a vmul-by-scalar. Rather than |
| * permute the data, simply arrange the multiplies to suit the way |
| * the data comes in. That's a lot of data, though, so things |
| * progress in pairs of pixels at a time. |
| */ |
| ld1 {v12.8h,v13.8h}, [x14] |
| lsr x14, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld1 {v14.8h,v15.8h}, [x15] |
| add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) |
| lsr x15, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| |
| smull v8.4s, v12.4h, v0.h[0] |
| smull v9.4s, v14.4h, v0.h[1] |
| smlsl2 v8.4s, v12.8h, v1.h[0] |
| smlsl2 v9.4s, v14.8h, v1.h[1] |
| smlsl v8.4s, v13.4h, v2.h[0] |
| smlsl v9.4s, v15.4h, v2.h[1] |
| smlal2 v8.4s, v13.8h, v3.h[0] |
| smlal2 v9.4s, v15.8h, v3.h[1] |
| |
| /* And two more... */ |
| ld1 {v12.8h,v13.8h}, [x16] |
| add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) |
| lsr x16, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| ld1 {v14.8h,v15.8h}, [x17] |
| add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) |
| lsr x17, x2, #(63 - CHUNKSHIFT) |
| add x2, x2, x3 |
| |
| sqrshrn v8.4h, v8.4s, #15 |
| add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) |
| sqrshrn2 v8.8h, v9.4s, #15 |
| |
| smull v10.4s, v12.4h, v0.h[2] |
| smull v11.4s, v14.4h, v0.h[3] |
| smlsl2 v10.4s, v12.8h, v1.h[2] |
| smlsl2 v11.4s, v14.8h, v1.h[3] |
| smlsl v10.4s, v13.4h, v2.h[2] |
| smlsl v11.4s, v15.4h, v2.h[3] |
| smlal2 v10.4s, v13.8h, v3.h[2] |
| smlal2 v11.4s, v15.8h, v3.h[3] |
| |
| sqrshrn v9.4h, v10.4s, #15 |
| sqrshrn2 v9.8h, v11.4s, #15 |
| |
| sqrshrun v8.8b, v8.8h, #VERTBITS - 8 |
| sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8 |
| |
| /* And two more... */ |
| ld1 {v12.8h,v13.8h}, [x14] |
| ld1 {v14.8h,v15.8h}, [x15] |
| |
| smull v10.4s, v12.4h, v0.h[4] |
| smull v11.4s, v14.4h, v0.h[5] |
| smlsl2 v10.4s, v12.8h, v1.h[4] |
| smlsl2 v11.4s, v14.8h, v1.h[5] |
| smlsl v10.4s, v13.4h, v2.h[4] |
| smlsl v11.4s, v15.4h, v2.h[5] |
| smlal2 v10.4s, v13.8h, v3.h[4] |
| smlal2 v11.4s, v15.8h, v3.h[5] |
| |
| /* And two more... */ |
| ld1 {v12.8h,v13.8h}, [x16] |
| ld1 {v14.8h,v15.8h}, [x17] |
| |
| subs x13, x13, #LOOP_OUTPUT_SIZE |
| |
| sqrshrn v9.4h, v10.4s, #15 |
| sqrshrn2 v9.8h, v11.4s, #15 |
| |
| smull v10.4s, v12.4h, v0.h[6] |
| smull v11.4s, v14.4h, v0.h[7] |
| smlsl2 v10.4s, v12.8h, v1.h[6] |
| smlsl2 v11.4s, v14.8h, v1.h[7] |
| smlsl v10.4s, v13.4h, v2.h[6] |
| smlsl v11.4s, v15.4h, v2.h[7] |
| smlal2 v10.4s, v13.8h, v3.h[6] |
| smlal2 v11.4s, v15.8h, v3.h[7] |
| |
| sqrshrn v10.4h, v10.4s, #15 |
| sqrshrn2 v10.8h, v11.4s, #15 |
| |
| sqrshrun v9.8b, v9.8h, #VERTBITS - 8 |
| sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8 |
| .endif |
| bgt 2b /* continue inner loop */ |
| /* The inner loop has already been limited to ensure that none of |
| * the earlier iterations could overfill the output, so the store |
| * appears within the loop but after the conditional branch (at the |
| * top). At the end, provided it won't overfill, perform the final |
| * store here. If it would, then break out to the tricky tail case |
| * instead. |
| */ |
| blt 1f |
| /* Store the amount of data appropriate to the configuration of the |
| * instance being assembled. |
| */ |
| .if LOOP_OUTPUT_SIZE == 4 |
| st1 {v8.s}[0], [x0], #4 |
| .elseif LOOP_OUTPUT_SIZE == 8 |
| st1 {v8.8b}, [x0], #8 |
| .elseif LOOP_OUTPUT_SIZE == 16 |
| st1 {v8.16b}, [x0], #16 |
| .elseif LOOP_OUTPUT_SIZE == 32 |
| st1 {v8.16b,v9.16b}, [x0], #32 |
| .endif |
| b 1b /* resume outer loop */ |
| /* Partial tail store case: |
| * Different versions of the code need different subsets of the |
| * following partial stores. Here the number of components and the |
| * size of the chunk of data produced by each inner loop iteration |
| * is tested to figure out whether or not each phrase is relevant. |
| */ |
| .if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16 |
| 1: tst x13, #16 |
| beq 1f |
| st1 {v8.16b}, [x0], #16 |
| mov v8.16b, v9.16b |
| .endif |
| .if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8 |
| 1: tst x13, #8 |
| beq 1f |
| st1 {v8.8b}, [x0], #8 |
| ext v8.16b, v8.16b, v8.16b, #8 |
| .endif |
| .if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4 |
| 1: tst x13, #4 |
| beq 1f |
| st1 {v8.s}[0], [x0], #4 |
| ext v8.8b, v8.8b, v8.8b, #4 |
| .endif |
| .if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2 |
| 1: tst x13, #2 |
| beq 1f |
| st1 {v8.h}[0], [x0], #2 |
| ext v8.8b, v8.8b, v8.8b, #2 |
| .endif |
| .if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1 |
| 1: tst x13, #1 |
| beq 1f |
| st1 {v8.b}[0], [x0], #1 |
| .endif |
| 1: |
| 9: mov sp, x19 |
| ld1 {v8.1d - v11.1d}, [sp], #32 |
| ld1 {v12.1d - v15.1d}, [sp], #32 |
| ldr x19, [sp], #16 |
| ret |
| END(rsdIntrinsicResizeB\comp\()_K) |
| .endr |
| |
| .rodata |
| intrinsic_resize_consts: .hword 0, 1, 2, 3, 4, 5, 6, 7 |