| # MMX assist routines for sumsq |
| # Copyright 2001 Phil Karn, KA9Q |
| # May be used under the terms of the GNU Public License (GPL) |
| |
| .text |
| |
| # Evaluate sum of squares of signed 16-bit input samples |
| # long long sumsq_mmx_assist(signed short *in,int cnt); |
| # |
| # In: 8(%ebp) = in, 12(%ebp) = cnt (arguments are read from the stack) |
| # Out: %edx:%eax = 64-bit sum of squares |
| # Samples are consumed in groups of 8. A trailing group of fewer than 8 |
| # samples is NOT processed, so cnt < 8 (including cnt <= 0) returns 0. |
| # Saves and restores %ebp, %esi, %ecx and %ebx; uses %mm0 and %mm6 and |
| # executes emms before returning. |
| .global sumsq_mmx_assist |
| .type sumsq_mmx_assist,@function |
| .align 16 |
| sumsq_mmx_assist: |
| pushl %ebp |
| movl %esp,%ebp |
| pushl %esi |
| pushl %ecx |
| pushl %ebx |
| |
| movl 8(%ebp),%esi # esi = in |
| movl 12(%ebp),%ecx # ecx = cnt (samples remaining) |
| xor %eax,%eax # edx:eax = 64-bit running sum |
| xor %edx,%edx |
| |
| # Each pmaddwd lane is a sum of two squares and is at most 2**31, which is |
| # reached when both samples are -32768. Adding two such lanes with paddd |
| # would wrap to 0, so every lane is added to the 64-bit total on its own. |
| 1: subl $8,%ecx |
| jl 2f # fewer than 8 samples left: done |
| movq (%esi),%mm0 # S0 S1 S2 S3 |
| pmaddwd %mm0,%mm0 # (S0^2+S1^2) (S2^2+S3^2) |
| movq 8(%esi),%mm6 # S4 S5 S6 S7 |
| pmaddwd %mm6,%mm6 # (S4^2+S5^2) (S6^2+S7^2) |
| |
| movd %mm0,%ebx # S0^2+S1^2, treated as unsigned 32-bit |
| addl %ebx,%eax |
| adcl $0,%edx # propagate carry into the high word |
| psrlq $32,%mm0 |
| movd %mm0,%ebx # S2^2+S3^2 |
| addl %ebx,%eax |
| adcl $0,%edx |
| movd %mm6,%ebx # S4^2+S5^2 |
| addl %ebx,%eax |
| adcl $0,%edx |
| psrlq $32,%mm6 |
| movd %mm6,%ebx # S6^2+S7^2 |
| addl %ebx,%eax |
| adcl $0,%edx |
| |
| addl $16,%esi # advance past the 8 samples just summed |
| jmp 1b |
| |
| 2: emms # leave MMX state so x87 code can run again |
| popl %ebx |
| popl %ecx |
| popl %esi |
| popl %ebp |
| ret |
| .size sumsq_mmx_assist,.-sumsq_mmx_assist |
| |
| # Fast 32-bit sum of squares of signed 16-bit input samples |
| # long sumsq_wd_mmx_assist(signed short *in,int cnt); |
| # |
| # All accumulation is done in 32-bit lanes with wrapping adds, so the result |
| # is only meaningful while the true sum fits in 32 bits, i.e. for small |
| # numbers of small input values. |
| # Samples are consumed in groups of 8; a trailing group of fewer than 8 |
| # samples is not processed. |
| .global sumsq_wd_mmx_assist |
| .type sumsq_wd_mmx_assist,@function |
| .align 16 |
| sumsq_wd_mmx_assist: |
| pushl %ebp |
| movl %esp,%ebp |
| pushl %esi |
| |
| movl 12(%ebp),%ecx # ecx = cnt (samples remaining) |
| movl 8(%ebp),%esi # esi = in |
| pxor %mm2,%mm2 # mm2 = two 32-bit partial sums, both zero |
| jmp .Lwd_test |
| |
| .Lwd_group: |
| movq (%esi),%mm0 # S0 S1 S2 S3 |
| movq 8(%esi),%mm1 # S4 S5 S6 S7 |
| pmaddwd %mm0,%mm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) |
| pmaddwd %mm1,%mm1 # (S4*S4+S5*S5) (S6*S6+S7*S7) |
| addl $16,%esi # step over the 8 samples just loaded |
| paddd %mm0,%mm2 |
| paddd %mm1,%mm2 # fold both products into the partial sums |
| .Lwd_test: |
| subl $8,%ecx |
| jge .Lwd_group # another full group of 8 is available |
| |
| # Horizontal add of the two 32-bit partial sums into %eax |
| movd %mm2,%eax # low lane |
| psrlq $32,%mm2 |
| movd %mm2,%edx # high lane |
| addl %edx,%eax |
| emms |
| popl %esi |
| popl %ebp |
| ret |