| /* |
| * Copyright (C) 2014 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart |
| #define END(f) .fnend; .size f, .-f; |
| /* ENTRY(f) and END(f) above bracket an exported function: ENTRY switches to |
| * .text, aligns, declares f as a global function symbol, defines the label f |
| * and opens an unwind region with .fnstart; END closes that region with |
| * .fnend and records the symbol size. The two directives below tag the |
| * object's EABI attributes and select the ARM (A32) instruction set for the |
| * code that follows. |
| */ |
| .eabi_attribute 25,1 @Tag_ABI_align8_preserved |
| .arm |
| |
| .macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1 |
| /* lanepair: fetch and trilinearly interpolate two pixels from the table. |
| * |
| * dst D register that receives the two results, 4 bytes per pixel |
| * src D register holding two 32-bit byte offsets into the table |
| * xr0, xr1 16-bit scalars: x blend fraction for the first / second pixel |
| * yr0, yr1 8-bit scalars: y blend fraction for the first / second pixel |
| * zr0, zr1 16-bit scalars: z blend fraction for the first / second pixel |
| * |
| * Expects r3 = table base, r4 = pitchy, r5 = pitchz (byte strides). |
| * Clobbers r6, r7, d6, d7 and q8-q15. src is consumed by the first |
| * instruction and dst is only written by the last one, so the same register |
| * may be passed for both. |
| * |
| * Every blend below has the form a*256 - a*f + b*f, i.e. a*(256-f) + b*f. |
| */ |
| vmov r6, r7, \src |
| /* turn the two offsets into absolute addresses */ |
| add r6, r6, r3 |
| add r7, r7, r3 |
| /* row y: 8 bytes per pixel, then advance one row (r4) */ |
| vld1.u8 d16, [r6], r4 |
| vld1.u8 d17, [r7], r4 |
| /* row y+1, then advance one slice (r5) */ |
| vld1.u8 d18, [r6], r5 |
| vld1.u8 d19, [r7], r5 |
| /* broadcast each pixel's y fraction to all eight byte lanes */ |
| vdup.u8 d6, \yr0 |
| vdup.u8 d7, \yr1 |
| /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */ |
| vshll.u8 q12, d16, #8 |
| vshll.u8 q13, d17, #8 |
| vmlsl.u8 q12, d16, d6 |
| vmlsl.u8 q13, d17, d7 |
| vmlal.u8 q12, d18, d6 |
| vmlal.u8 q13, d19, d7 |
| /* row y+1 of the next slice */ |
| vld1.u8 d18, [r6] |
| vld1.u8 d19, [r7] |
| /* step back one row to row y of that slice */ |
| sub r6, r6, r4 |
| sub r7, r7, r4 |
| |
| vld1.u8 d16, [r6] |
| vld1.u8 d17, [r7] |
| |
| /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */ |
| vshll.u8 q14, d16, #8 |
| vshll.u8 q15, d17, #8 |
| vmlsl.u8 q14, d16, d6 |
| vmlsl.u8 q15, d17, d7 |
| vmlal.u8 q14, d18, d6 |
| vmlal.u8 q15, d19, d7 |
| |
| /* Z interpolate, lane 0 q12/q14 -> q10 |
| * The low and high halves (d24/d25) are blended separately in 32 bits and |
| * narrowed back to 16 bits with rounding. |
| */ |
| vshll.u16 q8, d24, #8 |
| vshll.u16 q9, d25, #8 |
| vmlsl.u16 q8, d24, \zr0 |
| vmlsl.u16 q9, d25, \zr0 |
| vmlal.u16 q8, d28, \zr0 |
| vmlal.u16 q9, d29, \zr0 |
| vrshrn.u32 d20, q8, #8 |
| vrshrn.u32 d21, q9, #8 |
| |
| /* Z interpolate, lane 1 q13/q15 -> q11 */ |
| vshll.u16 q8, d26, #8 |
| vshll.u16 q9, d27, #8 |
| vmlsl.u16 q8, d26, \zr1 |
| vmlsl.u16 q9, d27, \zr1 |
| vmlal.u16 q8, d30, \zr1 |
| vmlal.u16 q9, d31, \zr1 |
| vrshrn.u32 d22, q8, #8 |
| vrshrn.u32 d23, q9, #8 |
| |
| /* X interpolate, lanes 0 and 1 q10,q11 -> q14 |
| * Blends the low half of each register (d20, d22) with its high half |
| * (d21, d23). Unlike the Z step, this narrowing truncates (vshrn). |
| */ |
| vshll.u16 q8, d20, #8 |
| vshll.u16 q9, d22, #8 |
| vmlsl.u16 q8, d20, \xr0 |
| vmlsl.u16 q9, d22, \xr1 |
| vmlal.u16 q8, d21, \xr0 |
| vmlal.u16 q9, d23, \xr1 |
| vshrn.u32 d28, q8, #8 |
| vshrn.u32 d29, q9, #8 |
| |
| /* pack both lanes to bytes -> the dst argument (rounding, saturating) */ |
| vqrshrn.u16 \dst, q14, #8 |
| .endm |
| |
| /* void rsdIntrinsic3DLUT_K( |
| * void *dst, // r0 |
| * void const *in, // r1 |
| * size_t count, // r2 |
| * void const *lut, // r3 |
| * int32_t pitchy, // [sp] |
| * int32_t pitchz, // [sp+#4] |
| * int dimx, // [sp+#8] |
| * int dimy, // [sp+#12] |
| * int dimz); // [sp+#16] |
| * |
| * Maps count 4-byte pixels from in to dst through a 3D lookup table. |
| * |
| * Channels 0, 1 and 2 of each pixel are multiplied by dimx, dimy and dimz. |
| * The high byte of each scaled value selects a table cell and the low byte |
| * is the blend fraction handed to lanepair. Table entries are addressed as |
| * x*4 + y*pitchy + z*pitchz, so entries are 4 bytes wide and the pitches are |
| * byte strides. Channel 3 is copied from the input unchanged. |
| * |
| * Pixels are processed eight at a time. A trailing group of 1-7 pixels is |
| * loaded and stored piecewise, so nothing past count pixels is read from in |
| * or written to dst. A count of 0 returns without storing anything. |
| * r0 is set to 0 on exit even though the prototype above says void. |
| */ |
| ENTRY(rsdIntrinsic3DLUT_K) |
| push {r4,r5,r6,r7} |
| ldr r4, [sp, #16] /* pitchy: stack arguments sit 16 bytes up after the push */ |
| ldr r5, [sp, #20] /* pitchz */ |
| ldr r6, [sp, #24] /* dimx */ |
| ldr r7, [sp, #28] /* dimy */ |
| ldr r12, [sp, #32] /* dimz */ |
| vpush {d8-d15} /* preserve the d8-d15 registers this routine overwrites */ |
| /* q4 = constants: d8 16-bit lanes 0..2 = dimx, dimy, dimz (lane 3 keeps the |
| * 0x0101 fill and is not read); d9 32-bit lanes = pitchy, pitchz. |
| */ |
| vmov.u8 d8, #1 |
| vmov.u16 d8[0], r6 |
| vmov.u16 d8[1], r7 |
| vmov.u16 d8[2], r12 |
| vmov d9, r4, r5 |
| /* r2 = count - 8. Non-negative: at least one full group of 8, go to 2. |
| * Otherwise exit if it is <= -8 (count was 0), else load 1-7 pixels at 4. |
| */ |
| subs r2, #8 |
| bge 2f |
| cmp r2, #-8 |
| ble 9f |
| b 4f |
| |
| .align 6 |
| 1: vst4.u8 {d12,d13,d14,d15}, [r0]! |
| /* Label 1 stores the previous group of 8; label 2 loads the next 8 pixels, |
| * splitting the four channels into d0, d2, d4 and d6. |
| * |
| * r0 = dst |
| * r1 = src |
| * r2 = count (pixels still to process, biased by -8) |
| * r3 = lut |
| * r4 = pitchy |
| * r5 = pitchz |
| * r6 = offset0 |
| * r7 = offset1 |
| */ |
| 2: vld4.u8 {d0,d2,d4,d6}, [r1]! |
| 3: vmov d10, d6 /* keep channel 3 so it can be written back unchanged */ |
| /* q0,q1,q2,q5 source data |
| * q4 dimensions and pitches |
| * q3, scratch register for scalar access |
| * |
| * q3 is refreshed from q4 on every pass because lanepair overwrites d6/d7. |
| * The copy presumably exists because 16-bit by-scalar operands have to come |
| * from d0-d7. |
| */ |
| vmov q3, q4 |
| vmovl.u8 q0, d0 |
| vmovl.u8 q1, d2 |
| vmovl.u8 q2, d4 |
| vmul.u16 q0, q0, d6[0] |
| vmul.u16 q1, q1, d6[1] |
| vmul.u16 q2, q2, d6[2] |
| |
| /* vrsra.u16 below would be more accurate, but this can result in a dim.0 case |
| * where we try to read from the limit of the array and the limit +1 to |
| * interpolate, even though the fractional component is zero. Strictly this is |
| * correct, except for the illegal access problem. |
| * |
| * Each lane becomes v + (v >> 8), without rounding. |
| */ |
| vsra.u16 q0, q0, #8 |
| vsra.u16 q1, q1, #8 |
| vsra.u16 q2, q2, #8 |
| /* high byte of each lane = integer table index */ |
| vshr.u16 q12, q0, #8 |
| vshr.u16 q13, q1, #8 |
| vshr.u16 q14, q2, #8 |
| /* low byte = blend fraction; y is narrowed to bytes because lanepair |
| * broadcasts it with vdup.u8, while x and z stay as 16-bit lanes. |
| */ |
| vbic.u16 q0, #0xff00 |
| vmovn.u16 d2, q1 |
| vbic.u16 q2, #0xff00 |
| |
| /* q0,d2,q2 fractional offset |
| * q12,q13,q14 integer offset |
| */ |
| /* byte offset = x*4 + y*pitchy + z*pitchz, as eight 32-bit lanes */ |
| vshll.u16 q6, d24, #2 |
| vshll.u16 q7, d25, #2 |
| vmovl.u16 q8, d26 |
| vmovl.u16 q9, d27 |
| vmovl.u16 q10, d28 |
| vmovl.u16 q11, d29 |
| vmla.s32 q6, q8, d9[0] |
| vmla.s32 q7, q9, d9[0] |
| vmla.s32 q6, q10, d9[1] |
| vmla.s32 q7, q11, d9[1] |
| |
| /* q6,q7 list of table offsets */ |
| /* Each lanepair below consumes one D register of offsets and overwrites it |
| * with the two finished pixels. |
| */ |
| /* lanes 0 and 1 */ |
| lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1] |
| |
| /* lanes 2 and 3 */ |
| lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3] |
| |
| /* lanes 4 and 5 */ |
| lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1] |
| |
| /* lanes 6 and 7 */ |
| lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3] |
| /* d12-d15 hold eight pixels in pixel order; unzip them so that each |
| * register holds one channel, the layout vst4 expects. |
| */ |
| vuzp.u8 d12, d13 |
| vuzp.u8 d14, d15 |
| vuzp.u8 d12, d14 |
| vuzp.u8 d13, d15 |
| |
| subs r2, r2, #8 |
| vmov.u8 d15, d10 /* replace channel 3 with the saved source channel 3 */ |
| /* flags are still those of the subs: r2 >= 0 means another full group */ |
| bge 1b |
| /* r2 < -8: the group just processed was a partial one, store it at 1 */ |
| cmp r2, #-8 |
| blt 1f |
| /* last full group: store it, then finish if nothing is left (the beq |
| * tests the flags of the cmp above) |
| */ |
| vst4.u8 {d12,d13,d14,d15}, [r0]! |
| |
| beq 9f |
| |
| /* fill the vector with a safe value: the next pixel is replicated into every |
| * slot, then the slots that really exist are overwritten. The low three bits |
| * of r2 equal the number of leftover pixels; bit 2 fills slots 0-3, bit 1 |
| * slots 4-5 and bit 0 slot 6. |
| */ |
| 4: vld1.u32 {d0[]}, [r1] |
| vmov d2, d0 |
| vmov d4, d0 |
| vmov d6, d0 |
| tst r2, #4 |
| beq 2f |
| vld1.u32 {d0}, [r1]! |
| vld1.u32 {d2}, [r1]! |
| 2: tst r2, #2 |
| beq 2f |
| vld1.u32 {d4}, [r1]! |
| 2: tst r2, #1 |
| beq 2f |
| vld1.u32 {d6[0]}, [r1]! |
| 2: vuzp.8 d0, d2 /* unzip to the channel-per-register layout vld4 produces */ |
| vuzp.8 d4, d6 |
| vuzp.8 d0, d4 |
| vuzp.8 d2, d6 |
| b 3b |
| /* partial store: zip back to pixel order, then write 4, 2 and 1 pixels |
| * according to the same low bits of r2 that drove the partial load |
| */ |
| 1: vzip.8 d12, d14 |
| vzip.8 d13, d15 |
| vzip.8 d12, d13 |
| vzip.8 d14, d15 |
| tst r2, #4 |
| beq 2f |
| vst1.u32 {d12,d13}, [r0]! |
| 2: tst r2, #2 |
| beq 2f |
| vst1.u32 {d14}, [r0]! |
| 2: tst r2, #1 |
| beq 9f |
| vst1.u32 {d15[0]}, [r0]! |
| /* common exit: r0 = 0, then undo the vpush and push from the prologue */ |
| 9: mov r0, #0 |
| vpop {d8-d15} |
| pop {r4,r5,r6,r7} |
| bx lr |
| END(rsdIntrinsic3DLUT_K) |