| /* |
| * Copyright (C) 2008 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* ---- includes ----------------------------------------------------------- */ |
| |
| #include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */ |
| |
| #if ( defined( WIN64 ) || defined( HW_SSE2 ) ) |
| |
| #include "emmintrin.h" |
| |
| /* disable warning "local variable 'x' used without having been initialized" */ |
| #pragma warning( disable : 4700 ) |
| |
| |
/** Using the lower half of the SSE2 registers (64-bit loads) to calculate a dot product.
 * This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 * Dependencies: input vectors need only natural int16 (16-bit) alignment; the 64-bit loads used here do not require 16-byte alignment
 * Return Value: int32 containing the result of the dot product
 */
| int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| __m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8; |
| int16* vec1L = ( int16* )vec1A; |
| int16* vec2L = ( int16* )vec2A; |
| |
| int32 resultL = 0; |
| uint32 alignOffSetL = 0; |
| |
| /* initialize registers to 0 */ |
| m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 ); |
| m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 ); |
| m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 ); |
| |
	alignOffSetL = sizeA % 16; /* number of trailing elements handled by the switch below */
	sizeA >>= 4;               /* number of 16-element blocks for the SSE2 loop */
| |
| if( sizeA ) |
| { |
| while( sizeA > 0 ) |
| { |
| m_XMM0 = _mm_loadl_epi64( (__m128i *)&0[vec1L] ); |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 ); |
| |
| m_XMM1 = _mm_loadl_epi64( (__m128i *)&0[vec2L] ); |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 ); |
| |
| m_XMM2 = _mm_loadl_epi64( (__m128i *)&4[vec1L] ); |
| |
| m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 ); |
| |
| m_XMM3 = _mm_loadl_epi64( (__m128i *)&4[vec2L] ); |
| m_XMM4 = _mm_loadl_epi64( (__m128i *)&8[vec1L] ); |
| |
| m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 ); |
| |
| m_XMM5 = _mm_loadl_epi64( (__m128i *)&8[vec2L] ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 ); |
| |
| m_XMM6 = _mm_loadl_epi64( (__m128i *)&12[vec1L] ); |
| |
| m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 ); |
| |
| m_XMM8 = _mm_loadl_epi64( (__m128i *)&12[vec2L] ); |
| m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 ); |
| |
| vec1L += 16; |
| vec2L += 16; |
| sizeA--; |
| } |
| |
| /* sum up accumulators */ |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 ); |
| |
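		/* horizontal add: the two 32-bit partial sums in the low 64 bits of m_XMM7 are added together */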
| m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 ); |
| |
| m_XMM0 = _mm_srli_epi64( m_XMM0, 32 ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 ); |
| |
| resultL = _mm_cvtsi128_si32( m_XMM7 ); |
| } |
| |
	/* a switch statement produces faster code than a loop here; the cases intentionally fall through to handle the remaining elements */
| switch( alignOffSetL ) |
| { |
| case 15: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 14: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 13: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 12: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 11: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 10: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 9: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 8: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 7: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 6: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 5: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 4: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 3: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 2: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 1: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| } |
| |
| return resultL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
/** Using the full register (128-bit) in SSE2 to calculate the dot product.
 * Dependencies: input vectors need to be 16-byte aligned
 * Return Value: int32 containing the result of the dot product
 */
| int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| __m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6; |
| int16* vec1L = ( int16* )vec1A; |
| int16* vec2L = ( int16* )vec2A; |
| |
| int32 resultL = 0; |
| uint32 alignOffSetL = 0; |
| |
| m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 ); |
| m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 ); |
| |
	alignOffSetL = sizeA % 16; /* number of trailing elements handled by the switch below */
	sizeA >>= 4;               /* number of 16-element blocks for the SSE2 loop */
| |
| if( sizeA ) |
| { |
| while( sizeA > 0 ) |
| { |
| m_XMM0 = _mm_load_si128( (__m128i *)&0[vec1L] ); |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
| m_XMM2 = _mm_load_si128( (__m128i *)&0[vec2L] ); |
| |
| m_XMM6 = _mm_load_si128( (__m128i *)&8[vec1L] ); |
| |
| m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 ); |
| |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 ); |
| |
| m_XMM3 = _mm_load_si128( (__m128i *)&8[vec2L] ); |
| |
| m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 ); |
| |
| vec1L += 16; |
| vec2L += 16; |
| sizeA--; |
| } |
| |
| /* sum up accumulators */ |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
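		/* horizontal add: sum the four 32-bit lanes of the accumulator */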
| m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 ); |
| |
| resultL = _mm_cvtsi128_si32( m_XMM0 ); /* 1st 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */ |
| } |
| |
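	/* handle the remaining elements; the cases intentionally fall through */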
| switch( alignOffSetL ) |
| { |
| case 15: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 14: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 13: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 12: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 11: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 10: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 9: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 8: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 7: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 6: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 5: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 4: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 3: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 2: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 1: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| } |
| |
| return resultL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| |
/** Using the full register (128-bit) in SSE2 to calculate the dot product (unaligned version).
 * Dependencies: input vectors do not need to be 16-byte aligned
 * Return Value: int32 containing the result of the dot product
 */
| int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| __m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6; |
| int16* vec1L = ( int16* )vec1A; |
| int16* vec2L = ( int16* )vec2A; |
| int32 resultL = 0; |
| uint32 alignOffSetL = 0; |
| |
| /* initialize registers to 0 */ |
| m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 ); |
| m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 ); |
| |
| |
	alignOffSetL = sizeA % 16; /* number of trailing elements handled by the switch below */
	sizeA >>= 4;               /* number of 16-element blocks for the SSE2 loop */
| |
| if( sizeA ) |
| { |
| while( sizeA > 0 ) |
| { |
| m_XMM0 = _mm_loadu_si128( (__m128i *)&0[vec1L] ); |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
| m_XMM2 = _mm_loadu_si128( (__m128i *)&0[vec2L] ); |
| |
| m_XMM6 = _mm_loadu_si128( (__m128i *)&8[vec1L] ); |
| |
| m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 ); |
| |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 ); |
| |
| m_XMM3 = _mm_loadu_si128( (__m128i *)&8[vec2L] ); |
| |
| m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 ); |
| |
| vec1L += 16; |
| vec2L += 16; |
| sizeA--; |
| } |
| |
| /* sum up accumulators */ |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
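		/* horizontal add: sum the four 32-bit lanes of the accumulator */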
| m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 ); |
| |
| resultL = _mm_cvtsi128_si32( m_XMM0 ); /* 1st 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */ |
| } |
| |
| |
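	/* handle the remaining elements; the cases intentionally fall through */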
| switch( alignOffSetL ) |
| { |
| case 15: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 14: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 13: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 12: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 11: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 10: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 9: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 8: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 7: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 6: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 5: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 4: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 3: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 2: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 1: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| } |
| |
| return resultL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
#endif /* WIN64 || HW_SSE2 */