| /* |
| * Copyright (C) 2014 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /* |
| * ENTRY(f): switch to .text, apply .align 4, export f as a global symbol of |
| * type function, and place the label f at the current location. |
| * END(f): set the recorded size of f to the distance from label f to the |
| * point where END is used. |
| */ |
| #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: |
| #define END(f) .size f, .-f; |
| |
| |
| .macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1 |
| /* lanepair: interpolated table lookup for two pixels at once (called |
| * "lane 0" and "lane 1" in the comments below). |
| * |
| * dst: receives 8 result bytes. A .8b operand gets them in its low half; |
| * the operands v20.16b and v21.16b get them in the upper half instead. |
| * src0, src1: 32-bit vector elements holding each pixel's table byte offset. |
| * yr0, yr1: byte elements holding each pixel's Y weight. |
| * xr0, xr1, zr0, zr1: 16-bit elements holding each pixel's X and Z weights. |
| * |
| * Implicit inputs: x3 is added to the offsets, x4 and x5 are the address |
| * steps between the fetched rows, and v5 supplies the low byte of every |
| * widened element (the caller in this file zeroes v5, so widening a byte |
| * yields byte * 256). |
| * Overwrites x6, x7 and v8-v19. |
| */ |
| smov x6, \src0 /* sign-extend the 32-bit table offsets */ |
| smov x7, \src1 |
| |
| add x6, x6, x3 /* x6, x7 = address of the first entry of each pixel */ |
| add x7, x7, x3 |
| |
| ld1 {v16.2s}, [x6], x4 /* 8 bytes at the start address, then step by x4 */ |
| ld1 {v17.2s}, [x7], x4 |
| |
| ld1 {v18.2s}, [x6], x5 /* 8 bytes at start + x4, then step by x5 */ |
| ld1 {v19.2s}, [x7], x5 |
| |
| dup v8.8b, \yr0 /* broadcast each pixel's Y weight to all bytes */ |
| dup v9.8b, \yr1 |
| /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 |
| * zip1 with v5 widens each byte of v16/v17 to 16 bits with v5 as the low |
| * byte; the result is then widened - v16 * weight + v18 * weight. |
| */ |
| zip1 v12.16b, v5.16b, v16.16b |
| zip1 v13.16b, v5.16b, v17.16b |
| umlsl v12.8h, v16.8b, v8.8b |
| umlsl v13.8h, v17.8b, v9.8b |
| umlal v12.8h, v18.8b, v8.8b |
| umlal v13.8h, v19.8b, v9.8b |
| |
| ld1 {v18.2s}, [x6] /* 8 bytes at start + x4 + x5 */ |
| ld1 {v19.2s}, [x7] |
| |
| sub x6, x6, x4 /* step back to start + x5 */ |
| sub x7, x7, x4 |
| |
| ld1 {v16.2s}, [x6] /* 8 bytes at start + x5 */ |
| ld1 {v17.2s}, [x7] |
| |
| /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 |
| * Same arithmetic as the front pair, reusing the Y weights in v8/v9. |
| */ |
| zip1 v14.16b, v5.16b, v16.16b |
| zip1 v15.16b, v5.16b, v17.16b |
| umlsl v14.8h, v16.8b, v8.8b |
| umlsl v15.8h, v17.8b, v9.8b |
| umlal v14.8h, v18.8b, v8.8b |
| umlal v15.8h, v19.8b, v9.8b |
| |
| /* Z interpolate, lane 0 v12/v14 -> v10 |
| * 32-bit: v12 * 256 - v12 * weight + v14 * weight, then a rounding |
| * narrow by 8 bits back to 16-bit elements. |
| */ |
| ushll v8.4s, v12.4h, #8 |
| ushll2 v9.4s, v12.8h, #8 |
| umlsl v8.4s, v12.4h, \zr0 |
| umlsl2 v9.4s, v12.8h, \zr0 |
| umlal v8.4s, v14.4h, \zr0 |
| umlal2 v9.4s, v14.8h, \zr0 |
| rshrn v10.4h, v8.4s, #8 |
| rshrn2 v10.8h, v9.4s, #8 |
| |
| /* Z interpolate, lane 1 v13/v15 -> v11 */ |
| ushll v8.4s, v13.4h, #8 |
| ushll2 v9.4s, v13.8h, #8 |
| umlsl v8.4s, v13.4h, \zr1 |
| umlsl2 v9.4s, v13.8h, \zr1 |
| umlal v8.4s, v15.4h, \zr1 |
| umlal2 v9.4s, v15.8h, \zr1 |
| rshrn v11.4h, v8.4s, #8 |
| rshrn2 v11.8h, v9.4s, #8 |
| |
| /* X interpolate, lanes 0 and 1 v10,v11 -> v14 |
| * The low four elements of v10/v11 come from the first 4 bytes of each |
| * 8-byte load and the high four from the next 4 bytes; they are blended |
| * as low * 256 - low * weight + high * weight, then narrowed by 8 bits |
| * without rounding. Lane 0 lands in v14.4h, lane 1 in the upper half. |
| */ |
| ushll v8.4s, v10.4h, #8 |
| ushll v9.4s, v11.4h, #8 |
| umlsl v8.4s, v10.4h, \xr0 |
| umlsl v9.4s, v11.4h, \xr1 |
| umlal2 v8.4s, v10.8h, \xr0 |
| umlal2 v9.4s, v11.8h, \xr1 |
| shrn v14.4h, v8.4s, #8 |
| shrn2 v14.8h, v9.4s, #8 |
| |
| /* pack lanes 0-1 -> dst: saturating, rounding narrow of v14 to 8 bytes. |
| * v20.16b and v21.16b take the upper-half form, any other dst the |
| * low-half form. |
| */ |
| .ifc \dst, v20.16b |
| uqrshrn2 \dst, v14.8h, #8 |
| .else ; .ifc \dst, v21.16b |
| uqrshrn2 \dst, v14.8h, #8 |
| .else |
| uqrshrn \dst, v14.8h, #8 |
| .endif ; .endif |
| .endm |
| |
| /* void rsdIntrinsic3DLUT_K( |
| * void *dst, // x0 |
| * void const *in, // x1 |
| * size_t count, // x2 |
| * void const *lut, // x3 |
| * int32_t pitchy, // w4 |
| * int32_t pitchz, // w5 |
| * int dimx, // w6 |
| * int dimy, // w7 |
| * int dimz); // [sp] |
| * |
| * Converts count 4-byte pixels from in to dst through a 3D lookup table. |
| * Bytes 0, 1 and 2 of each input pixel are multiplied by dimx, dimy and dimz |
| * to form a table coordinate with an 8-bit fraction. The eight surrounding |
| * 4-byte table entries (steps of 4 bytes, pitchy and pitchz) are interpolated |
| * by the lanepair macro. Bytes 0-2 of the interpolated value are stored and |
| * byte 3 of the output is copied from the input pixel. |
| * |
| * Pixels are processed eight at a time; a final group of 1-7 pixels is loaded |
| * and stored lane by lane so no memory beyond count pixels is touched in |
| * either buffer. count is compared as a signed value; nothing is done when it |
| * is not positive. |
| * |
| * d8-d15 are saved on the stack and restored before returning. x0 and x1 are |
| * advanced past the processed pixels; x2, x4-x8, v0-v7, v16-v23 and the flags |
| * are modified. |
| */ |
| ENTRY(rsdIntrinsic3DLUT_K) |
| ldr w8, [sp] /* dimz: read the stack argument before sp is moved */ |
| stp d8, d9, [sp, #-64]! |
| stp d10, d11, [sp, #16] |
| stp d12, d13, [sp, #32] |
| stp d14, d15, [sp, #48] |
| /* pitchy and pitchz are 32-bit arguments, but lanepair uses x4 and x5 whole |
| * as post-index address steps. AAPCS64 leaves the upper 32 bits of such an |
| * argument register unspecified, so widen them explicitly. Sign extension |
| * matches the signed widening (smov) applied to the table offsets. |
| */ |
| sxtw x4, w4 |
| sxtw x5, w5 |
| /* v4: h[0] = dimx, h[1] = dimy, h[2] = dimz, s[2] = pitchy, s[3] = pitchz. |
| * h[3] keeps the movi fill and is not referenced. |
| */ |
| movi v4.8b, #1 |
| ins v4.h[0], w6 |
| ins v4.h[1], w7 |
| ins v4.h[2], w8 |
| ins v4.s[2], w4 |
| ins v4.s[3], w5 |
| movi v5.16b, #0 /* zero low bytes for the zip1 widening in lanepair */ |
| |
| /* From here on x2 holds the remaining count minus 8. */ |
| subs x2, x2, #8 |
| bge 2f /* at least 8 pixels: enter the main loop */ |
| cmn x2, #8 // same as cmp x2, #-8 |
| ble 9f /* count <= 0: nothing to do */ |
| b 4f /* 1-7 pixels: partial load */ |
| |
| .align 6 |
| 1: st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32 |
| /* x0 = dst |
| * x1 = src |
| * x2 = count |
| * x3 = lut |
| * x4 = pitchy |
| * x5 = pitchz |
| * x6 = offset0 |
| * x7 = offset1 |
| */ |
| 2: ld4 {v0.8b-v3.8b}, [x1], #32 |
| /* v0,v1,v2,v3 source data |
| * v4 dimensions and pitches |
| */ |
| 3: uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| uxtl v2.8h, v2.8b |
| mul v0.8h, v0.8h, v4.h[0] |
| mul v1.8h, v1.8h, v4.h[1] |
| mul v2.8h, v2.8h, v4.h[2] |
| |
| /* ursra below would be more accurate, but this can result in a dim.0 case |
| * where we try to read from the limit of the array and the limit +1 to |
| * interpolate, even though the fractional component is zero. Strictly this is |
| * correct, except for the illegal access problem. |
| */ |
| usra v0.8h, v0.8h, #8 |
| usra v1.8h, v1.8h, #8 |
| usra v2.8h, v2.8h, #8 |
| |
| /* Split each coordinate: high byte = integer index, low byte = fraction. */ |
| ushr v12.8h, v0.8h, #8 |
| ushr v13.8h, v1.8h, #8 |
| ushr v14.8h, v2.8h, #8 |
| bic v0.8h, #0xff, LSL #8 |
| xtn v1.8b, v1.8h |
| bic v2.8h, #0xff, LSL #8 |
| |
| /* v0.8h,v1.8b,v2.8h fractional offset |
| * v12.8h,v13.8h,v14.8h integer offset |
| */ |
| |
| /* Byte offset of each pixel's first table entry, as 32-bit elements: |
| * x index * 4 + y index * pitchy + z index * pitchz. |
| */ |
| ushll v6.4s, v12.4h, #2 |
| ushll2 v7.4s, v12.8h, #2 |
| uxtl v8.4s, v13.4h |
| uxtl2 v9.4s, v13.8h |
| uxtl v10.4s, v14.4h |
| uxtl2 v11.4s, v14.8h |
| mla v6.4s, v8.4s, v4.s[2] |
| mla v7.4s, v9.4s, v4.s[2] |
| mla v6.4s, v10.4s, v4.s[3] |
| mla v7.4s, v11.4s, v4.s[3] |
| |
| /* v6,v7 list of table offsets */ |
| |
| /* lanes 0 and 1 */ |
| lanepair dst=v20.8b, src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1] |
| |
| /* lanes 2 and 3 */ |
| lanepair dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3] |
| |
| /* lanes 4 and 5 */ |
| lanepair dst=v21.8b, src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5] |
| |
| /* lanes 6 and 7 */ |
| lanepair dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7] |
| |
| /* v20:v21 hold eight 4-byte pixels. Two rounds of uzp separate them into one |
| * register per byte position for st4: v20 = byte 0, v21 = byte 1, |
| * v22 = byte 2 of every pixel. |
| */ |
| uzp1 v6.16b, v20.16b, v21.16b |
| uzp2 v7.16b, v20.16b, v21.16b |
| uzp1 v20.16b, v6.16b, v7.16b |
| uzp2 v22.16b, v6.16b, v7.16b |
| mov v21.d[0], v20.d[1] |
| |
| subs x2, x2, #8 |
| mov v23.8b, v3.8b /* byte 3 passes through from the input */ |
| |
| bge 1b /* another full group of 8 follows */ |
| |
| cmn x2, #8 // same as cmp x2, #-8 |
| blt 1f /* this was the partial group: store lane by lane */ |
| |
| st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32 |
| beq 9f /* no pixels left over */ |
| |
| /* Partial group: the low three bits of x2 equal the number of pixels left. */ |
| /* fill the vector with a safe value */ |
| 4: ld4r {v0.8b-v3.8b}, [x1] |
| tbz x2, #2, 2f |
| ld4 {v0.b-v3.b}[0], [x1], #4 |
| ld4 {v0.b-v3.b}[1], [x1], #4 |
| ld4 {v0.b-v3.b}[2], [x1], #4 |
| ld4 {v0.b-v3.b}[3], [x1], #4 |
| 2: tbz x2, #1, 2f |
| ld4 {v0.b-v3.b}[4], [x1], #4 |
| ld4 {v0.b-v3.b}[5], [x1], #4 |
| 2: tbz x2, #0, 2f |
| ld4 {v0.b-v3.b}[6], [x1], #4 |
| 2: b 3b |
| |
| /* Partial store: writes the same lanes that the partial load filled. */ |
| 1: tst x2, #4 |
| beq 2f |
| st4 {v20.b-v23.b}[0], [x0], #4 |
| st4 {v20.b-v23.b}[1], [x0], #4 |
| st4 {v20.b-v23.b}[2], [x0], #4 |
| st4 {v20.b-v23.b}[3], [x0], #4 |
| 2: tst x2, #2 |
| beq 2f |
| st4 {v20.b-v23.b}[4], [x0], #4 |
| st4 {v20.b-v23.b}[5], [x0], #4 |
| 2: tst x2, #1 |
| beq 9f |
| st4 {v20.b-v23.b}[6], [x0], #4 |
| |
| /* Restore the saved d8-d15 and release the 64-byte frame. */ |
| 9: ldp d14, d15, [sp, #48] |
| ldp d12, d13, [sp, #32] |
| ldp d10, d11, [sp, #16] |
| ldp d8, d9, [sp], #64 |
| ret |
| END(rsdIntrinsic3DLUT_K) |