| /* |
| * Copyright (C) 2008 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* ---- includes ----------------------------------------------------------- */ |
| |
| #include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */ |
| |
| #if ( defined( WIN64 ) || defined( HW_SSE2 ) ) |
| |
| #include "emmintrin.h" |
| |
| /* disable warning "local variable 'x' used without having been initialized" */ |
| #pragma warning( disable : 4700 ) |
| |
| |
/** Using the lower half of the SSE2 registers (64-bit loads) to calculate a dot product.
 * This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 * Dependencies: input vectors need only natural int16 (16-bit) alignment; the 64-bit loads used here do not require 16-byte alignment
 * Return Value: int32 containing the result of the dot product
 */
| int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| __m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8; |
| int16* vec1L = ( int16* )vec1A; |
| int16* vec2L = ( int16* )vec2A; |
| |
| int32 resultL = 0; |
| uint32 alignOffSetL = 0; |
| |
| /* initialize registers to 0 */ |
| m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 ); |
| m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 ); |
| m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 ); |
| |
	alignOffSetL = sizeA % 16; /* number of trailing elements handled by the switch below */
	sizeA >>= 4;               /* number of 16-element blocks for the SSE2 loop */
| |
| if( sizeA ) |
| { |
| while( sizeA > 0 ) |
| { |
| m_XMM0 = _mm_loadl_epi64( (__m128i *)&0[vec1L] ); |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 ); |
| |
| m_XMM1 = _mm_loadl_epi64( (__m128i *)&0[vec2L] ); |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 ); |
| |
| m_XMM2 = _mm_loadl_epi64( (__m128i *)&4[vec1L] ); |
| |
| m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 ); |
| |
| m_XMM3 = _mm_loadl_epi64( (__m128i *)&4[vec2L] ); |
| m_XMM4 = _mm_loadl_epi64( (__m128i *)&8[vec1L] ); |
| |
| m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 ); |
| |
| m_XMM5 = _mm_loadl_epi64( (__m128i *)&8[vec2L] ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 ); |
| |
| m_XMM6 = _mm_loadl_epi64( (__m128i *)&12[vec1L] ); |
| |
| m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 ); |
| |
| m_XMM8 = _mm_loadl_epi64( (__m128i *)&12[vec2L] ); |
| m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 ); |
| |
| vec1L += 16; |
| vec2L += 16; |
| sizeA--; |
| } |
| |
| /* sum up accumulators */ |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 ); |
| |
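		/* horizontal add: the two 32-bit partial sums in the low 64 bits of m_XMM7 are added together */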
| m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 ); |
| |
| m_XMM0 = _mm_srli_epi64( m_XMM0, 32 ); |
| |
| m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 ); |
| |
| resultL = _mm_cvtsi128_si32( m_XMM7 ); |
| } |
| |
	/* a switch statement produces faster code than a loop here; the cases intentionally fall through to handle the remaining elements */
| switch( alignOffSetL ) |
| { |
| case 15: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 14: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 13: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 12: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 11: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 10: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 9: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 8: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 7: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 6: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 5: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 4: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 3: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 2: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 1: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| } |
| |
| return resultL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
/** Using the full register (128-bit) in SSE2 to calculate the dot product.
 * Dependencies: input vectors need to be 16-byte aligned
 * Return Value: int32 containing the result of the dot product
 */
| int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| __m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6; |
| int16* vec1L = ( int16* )vec1A; |
| int16* vec2L = ( int16* )vec2A; |
| |
| int32 resultL = 0; |
| uint32 alignOffSetL = 0; |
| |
| m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 ); |
| m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 ); |
| |
	alignOffSetL = sizeA % 16; /* number of trailing elements handled by the switch below */
	sizeA >>= 4;               /* number of 16-element blocks for the SSE2 loop */
| |
| if( sizeA ) |
| { |
| while( sizeA > 0 ) |
| { |
| m_XMM0 = _mm_load_si128( (__m128i *)&0[vec1L] ); |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
| m_XMM2 = _mm_load_si128( (__m128i *)&0[vec2L] ); |
| |
| m_XMM6 = _mm_load_si128( (__m128i *)&8[vec1L] ); |
| |
| m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 ); |
| |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 ); |
| |
| m_XMM3 = _mm_load_si128( (__m128i *)&8[vec2L] ); |
| |
| m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 ); |
| |
| vec1L += 16; |
| vec2L += 16; |
| sizeA--; |
| } |
| |
| /* sum up accumulators */ |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
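		/* horizontal add: sum the four 32-bit lanes of the accumulator */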
| m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 ); |
| |
| resultL = _mm_cvtsi128_si32( m_XMM0 ); /* 1st 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */ |
| } |
| |
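	/* handle the remaining elements; the cases intentionally fall through */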
| switch( alignOffSetL ) |
| { |
| case 15: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 14: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 13: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 12: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 11: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 10: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 9: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 8: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 7: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 6: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 5: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 4: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 3: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 2: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 1: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| } |
| |
| return resultL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| |
/** Using the full register (128-bit) in SSE2 to calculate the dot product (unaligned version).
 * Dependencies: input vectors do not need to be 16-byte aligned
 * Return Value: int32 containing the result of the dot product
 */
| int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA ) |
| { |
| __m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6; |
| int16* vec1L = ( int16* )vec1A; |
| int16* vec2L = ( int16* )vec2A; |
| int32 resultL = 0; |
| uint32 alignOffSetL = 0; |
| |
| /* initialize registers to 0 */ |
| m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 ); |
| m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 ); |
| |
| |
	alignOffSetL = sizeA % 16; /* number of trailing elements handled by the switch below */
	sizeA >>= 4;               /* number of 16-element blocks for the SSE2 loop */
| |
| if( sizeA ) |
| { |
| while( sizeA > 0 ) |
| { |
| m_XMM0 = _mm_loadu_si128( (__m128i *)&0[vec1L] ); |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
| m_XMM2 = _mm_loadu_si128( (__m128i *)&0[vec2L] ); |
| |
| m_XMM6 = _mm_loadu_si128( (__m128i *)&8[vec1L] ); |
| |
| m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 ); |
| |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 ); |
| |
| m_XMM3 = _mm_loadu_si128( (__m128i *)&8[vec2L] ); |
| |
| m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 ); |
| |
| vec1L += 16; |
| vec2L += 16; |
| sizeA--; |
| } |
| |
| /* sum up accumulators */ |
| m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 ); |
| |
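		/* horizontal add: sum the four 32-bit lanes of the accumulator */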
| m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 ); |
| |
| resultL = _mm_cvtsi128_si32( m_XMM0 ); /* 1st 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */ |
| |
| m_XMM0 = _mm_srli_si128( m_XMM0, 4 ); |
| |
| resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */ |
| } |
| |
| |
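	/* handle the remaining elements; the cases intentionally fall through */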
| switch( alignOffSetL ) |
| { |
| case 15: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 14: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 13: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 12: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 11: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 10: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 9: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 8: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 7: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 6: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 5: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 4: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 3: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 2: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| case 1: |
| resultL += ( int32 )*vec1L++ * *vec2L++; |
| } |
| |
| return resultL; |
| } |
| |
| /* ------------------------------------------------------------------------- */ |
| |
#endif /* WIN64 || HW_SSE2 */