| /* |
| * Copyright (c) 2013 ARM Ltd |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * 3. The name of the company may not be used to endorse or promote |
| * products derived from this software without specific prior written |
| * permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED |
| * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
| * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |
| * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
   unaligned access.

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The job is done in 5 steps.
   Step 1: Align the src/dst pointers; fall back to a misaligned copy if
	   they cannot both be aligned.
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
   Step 4: Copy word by word.
   Step 5: Copy byte by byte.

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes. Defaults to 64.
     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes. Defaults to 16.
*/
| #ifndef __OPT_BIG_BLOCK_SIZE |
| #define __OPT_BIG_BLOCK_SIZE (4 * 16) |
| #endif |
| |
| #ifndef __OPT_MID_BLOCK_SIZE |
| #define __OPT_MID_BLOCK_SIZE (4 * 4) |
| #endif |
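
/* For example, a build that prefers smaller unrolled loops can override
   both sizes on the command line (the file name here is illustrative):

     arm-none-eabi-gcc -c -D__OPT_BIG_BLOCK_SIZE=32 -D__OPT_MID_BLOCK_SIZE=8 \
       memcpy-armv7m.S

   Only the values accepted by the checks below assemble: 16, 32 or 64 for
   the big block and 8 or 16 for the mid block.  */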
| |
| #if __OPT_BIG_BLOCK_SIZE == 16 |
| #define BEGIN_UNROLL_BIG_BLOCK \ |
| .irp offset, 0,4,8,12 |
| #elif __OPT_BIG_BLOCK_SIZE == 32 |
| #define BEGIN_UNROLL_BIG_BLOCK \ |
| .irp offset, 0,4,8,12,16,20,24,28 |
| #elif __OPT_BIG_BLOCK_SIZE == 64 |
| #define BEGIN_UNROLL_BIG_BLOCK \ |
| .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60 |
| #else |
| #error "Illegal __OPT_BIG_BLOCK_SIZE" |
| #endif |
| |
| #if __OPT_MID_BLOCK_SIZE == 8 |
| #define BEGIN_UNROLL_MID_BLOCK \ |
| .irp offset, 0,4 |
| #elif __OPT_MID_BLOCK_SIZE == 16 |
| #define BEGIN_UNROLL_MID_BLOCK \ |
| .irp offset, 0,4,8,12 |
| #else |
| #error "Illegal __OPT_MID_BLOCK_SIZE" |
| #endif |
| |
| #define END_UNROLL .endr |
| |
| .syntax unified |
| .text |
| .align 2 |
| .global memcpy |
| .thumb |
| .thumb_func |
| .type memcpy, %function |
| memcpy: |
| @ r0: dst |
| @ r1: src |
| @ r2: len |
| #ifdef __ARM_FEATURE_UNALIGNED |
	/* When unaligned access is supported, ip is not used anywhere
	   else in the function body, so it can hold the return value.  */
| mov ip, r0 |
| #else |
| push {r0} |
| #endif |
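	/* If either dst or src has one of its two low address bits set,
	   take the misaligned path; otherwise both are word-aligned.  */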
| orr r3, r1, r0 |
| ands r3, r3, #3 |
| bne .Lmisaligned_copy |
| |
| .Lbig_block: |
| subs r2, __OPT_BIG_BLOCK_SIZE |
| blo .Lmid_block |
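
	/* From here on r2 is biased by -__OPT_BIG_BLOCK_SIZE: the subs/bhs
	   pair at the bottom of the loop iterates while at least one more
	   full block remains.  */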
| |
| /* Kernel loop for big block copy */ |
| .align 2 |
| .Lbig_block_loop: |
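	/* On __ARM_ARCH_7EM__ (e.g. Cortex-M4) each word is moved with
	   post-indexed ldr/str; on __ARM_ARCH_7M__ (e.g. Cortex-M3) plain
	   offset addressing is used and the pointers are advanced once per
	   block instead.  */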
| BEGIN_UNROLL_BIG_BLOCK |
| #ifdef __ARM_ARCH_7EM__ |
| ldr r3, [r1], #4 |
| str r3, [r0], #4 |
| END_UNROLL |
| #else /* __ARM_ARCH_7M__ */ |
| ldr r3, [r1, \offset] |
| str r3, [r0, \offset] |
| END_UNROLL |
| adds r0, __OPT_BIG_BLOCK_SIZE |
| adds r1, __OPT_BIG_BLOCK_SIZE |
| #endif |
| subs r2, __OPT_BIG_BLOCK_SIZE |
| bhs .Lbig_block_loop |
| |
| .Lmid_block: |
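	/* Undo part of the big-block bias: r2 becomes the remaining length
	   minus __OPT_MID_BLOCK_SIZE, negative when less than one mid
	   block is left.  */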
| adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE |
| blo .Lcopy_word_by_word |
| |
| /* Kernel loop for mid-block copy */ |
| .align 2 |
| .Lmid_block_loop: |
| BEGIN_UNROLL_MID_BLOCK |
| #ifdef __ARM_ARCH_7EM__ |
| ldr r3, [r1], #4 |
| str r3, [r0], #4 |
| END_UNROLL |
| #else /* __ARM_ARCH_7M__ */ |
| ldr r3, [r1, \offset] |
| str r3, [r0, \offset] |
| END_UNROLL |
| adds r0, __OPT_MID_BLOCK_SIZE |
| adds r1, __OPT_MID_BLOCK_SIZE |
| #endif |
| subs r2, __OPT_MID_BLOCK_SIZE |
| bhs .Lmid_block_loop |
| |
| .Lcopy_word_by_word: |
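	/* Re-bias r2 once more: it becomes the remaining length minus 4,
	   negative when less than one word is left.  */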
| adds r2, __OPT_MID_BLOCK_SIZE - 4 |
| blo .Lcopy_less_than_4 |
| |
/* Kernel loop for word-by-word copy */
| .align 2 |
| .Lcopy_word_by_word_loop: |
| ldr r3, [r1], #4 |
| str r3, [r0], #4 |
| subs r2, #4 |
| bhs .Lcopy_word_by_word_loop |
| |
| .Lcopy_less_than_4: |
| adds r2, #4 |
| beq .Ldone |
| |
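	/* lsls #31 shifts bit 1 of the remaining count into the carry flag
	   and leaves bit 0 as the (non-)zero result: copy one byte if the
	   count is odd, then a halfword below if bit 1 was set.  */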
| lsls r2, r2, #31 |
| itt ne |
| ldrbne r3, [r1], #1 |
| strbne r3, [r0], #1 |
| |
| bcc .Ldone |
| #ifdef __ARM_FEATURE_UNALIGNED |
| ldrh r3, [r1] |
| strh r3, [r0] |
| #else |
| ldrb r3, [r1] |
| strb r3, [r0] |
| ldrb r3, [r1, #1] |
| strb r3, [r0, #1] |
| #endif /* __ARM_FEATURE_UNALIGNED */ |
| |
| .Ldone: |
| #ifdef __ARM_FEATURE_UNALIGNED |
| mov r0, ip |
| #else |
| pop {r0} |
| #endif |
| bx lr |
| |
| .align 2 |
| .Lmisaligned_copy: |
| #ifdef __ARM_FEATURE_UNALIGNED |
	/* Define the label Ldst_aligned as an alias for Lbig_block: once
	   the destination has been adjusted to a word boundary, control
	   continues with the aligned copy.  */
#define Ldst_aligned Lbig_block

	/* Copy word by word using LDR/STR, relying on the hardware to
	   handle unaligned accesses (on ARMv7-M this requires the
	   CCR.UNALIGN_TRP bit to be clear).  */
| |
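	/* Below 8 bytes the alignment preamble costs more than copying
	   byte by byte.  */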
| cmp r2, #8 |
| blo .Lbyte_copy |
| |
	/* If src is aligned as well, go straight to the big block loop
	   (lsls #30 sets Z only when the two low address bits are zero).  */
| lsls r3, r1, #30 |
| beq .Ldst_aligned |
| #else |
	/* If len < 12, the misalignment adjustment has more overhead than
	   a simple byte-by-byte copy.  Also, len must be >= 8 to guarantee
	   that the code which follows works correctly.  */
| cmp r2, #12 |
| blo .Lbyte_copy |
| #endif /* __ARM_FEATURE_UNALIGNED */ |
| |
	/* Align dst only; do not try to align src.  This is because
	   handling an aligned src with a misaligned dst needs more overhead
	   than the other way round.  The worst case is then an initially
	   aligned src: up to 4 additional byte copies are executed, which
	   is acceptable.  */
| |
| ands r3, r0, #3 |
| beq .Ldst_aligned |
| |
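	/* The rsb below sets r3 = 4 - (dst & 3), the number of head bytes
	   needed to align dst.  The same lsls-into-flags trick as in the
	   tail copy then selects a one-byte and/or a two-byte head copy.  */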
| rsb r3, #4 |
| subs r2, r3 |
| |
| lsls r3, r3, #31 |
| itt ne |
| ldrbne r3, [r1], #1 |
| strbne r3, [r0], #1 |
| |
| bcc .Ldst_aligned |
| |
| #ifdef __ARM_FEATURE_UNALIGNED |
| ldrh r3, [r1], #2 |
| strh r3, [r0], #2 |
| b .Ldst_aligned |
| #else |
| ldrb r3, [r1], #1 |
| strb r3, [r0], #1 |
| ldrb r3, [r1], #1 |
| strb r3, [r0], #1 |
	/* dst is now aligned.  */
.Ldst_aligned:
	/* If r1 is aligned as well, then r0 and r1 had the same
	   misalignment and both are now aligned: go to the aligned copy.  */
| ands r3, r1, #3 |
| beq .Lbig_block |
| |
| /* dst is aligned, but src isn't. Misaligned copy. */ |
| |
| push {r4, r5} |
| subs r2, #4 |
| |
	/* Rewind r1 by its misalignment so that it becomes word-aligned.
	   Since r1 must be restored to the unaligned address actually
	   reached once the loop is done, keep the combined rewind and
	   pre-load overshoot (4 - r3) in ip and subtract it from r1
	   afterward.  */
| subs r1, r3 |
| rsb ip, r3, #4 |
| |
	/* Pre-load one word.  */
| ldr r4, [r1], #4 |
| |
| cmp r3, #2 |
| beq .Lmisaligned_copy_2_2 |
| cmp r3, #3 |
| beq .Lmisaligned_copy_3_1 |
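
	/* The remaining case, r3 == 1, falls through to
	   .Lmisaligned_copy_1_3 (the .macro definition below emits no
	   code by itself).  */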
| |
	.macro mis_src_copy shift
1:
	lsrs r4, r4, \shift		@ bring the wanted bytes of the current word down
	ldr r3, [r1], #4		@ fetch the next aligned source word
	lsls r5, r3, 32-\shift		@ the bytes of the next word that complete it
	orr r4, r4, r5			@ merge into one destination word
	str r4, [r0], #4
	mov r4, r3			@ the next word becomes the current one
	subs r2, #4
	bhs 1b
	.endm
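
/* The macro above corresponds roughly to this C, assuming the usual
   little-endian configuration (an illustrative sketch; src32, dst32 and
   n are hypothetical names, and shift is 8 * the original src
   misalignment):

     int32_t n = remaining - 4;     // r2 is pre-biased before the loop
     uint32_t cur = *src32++;       // the pre-loaded word held in r4
     do
       {
         uint32_t next = *src32++;
         *dst32++ = (cur >> shift) | (next << (32 - shift));
         cur = next;
         n -= 4;
       }
     while (n >= 0);
*/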
| |
| .Lmisaligned_copy_1_3: |
| mis_src_copy shift=8 |
| b .Lsrc_misaligned_tail |
| |
| .Lmisaligned_copy_3_1: |
| mis_src_copy shift=24 |
| b .Lsrc_misaligned_tail |
| |
| .Lmisaligned_copy_2_2: |
| /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */ |
| mis_src_copy shift=16 |
| |
| .Lsrc_misaligned_tail: |
| adds r2, #4 |
| subs r1, ip |
| pop {r4, r5} |
| |
| #endif /* __ARM_FEATURE_UNALIGNED */ |
| |
| .Lbyte_copy: |
| subs r2, #4 |
| blo .Lcopy_less_than_4 |
| |
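	/* r2 is len - 4 here, so the loop below copies len - 3 bytes (it
	   exits once r2 borrows below zero) and the three fixed byte
	   copies after it finish the job.  */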
| .Lbyte_copy_loop: |
| subs r2, #1 |
| ldrb r3, [r1], #1 |
| strb r3, [r0], #1 |
| bhs .Lbyte_copy_loop |
| |
| ldrb r3, [r1] |
| strb r3, [r0] |
| ldrb r3, [r1, #1] |
| strb r3, [r0, #1] |
| ldrb r3, [r1, #2] |
| strb r3, [r0, #2] |
| |
| #ifdef __ARM_FEATURE_UNALIGNED |
| mov r0, ip |
| #else |
| pop {r0} |
| #endif |
| bx lr |
| |
| .size memcpy, .-memcpy |