Revive forEachDstAtop implementations for NEON/ASIMD

ScriptIntrinsicBlend's forEachDstAtop had broken NEON and ASIMD
implementations, so they were bypassed.

This CL fixes both implementations and re-enables them in
RsdCpuScriptIntrinsicBlend::kernel().

Bug: b/22047392
Test: CTS and RsTest on Angler
Change-Id: Ifa41d74222606f1c04b7b4e3fe1b43eb932ead89
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 1507d45..ce30092 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -119,9 +119,7 @@
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    // Bug: 22047392 - Skip optimized version for BLEND_DST_ATOP until this
-    // been fixed.
-    if (gArchUseSIMD && info->slot != BLEND_DST_ATOP) {
+    if (gArchUseSIMD) {
         if (rsdIntrinsicBlend_K(out, in, info->slot, x1, x2) >= 0)
             return;
     }
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
index 90dbd73..1c721b6 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
@@ -350,7 +350,7 @@
         uqrshrn     v2.8b,  v2.8h,  #8
         uqrshrn2    v2.16b, v14.8h, #8
 
-        mvn         v3.16b, v3.16b
+        mov         v3.16b, v11.16b
 .endm
 
 #define params_MULTIPLY zipped=0
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blend.S b/cpu_ref/rsCpuIntrinsics_neon_Blend.S
index f0145e5..a1fa1b5 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blend.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blend.S
@@ -353,7 +353,7 @@
         vqrshrn.u16 d4, q2,  #8
         vqrshrn.u16 d5, q14, #8
 
-        vmvn        q3, q3
+        vmov        q3, q11
 .endm
 
 #define params_MULTIPLY zipped=0