[PATCH] [Trusty/Utopic] UBUNTU: SAUCE: (no-up) arm64: optimized copy_to_user and copy_from_user assembly code
Brad Figg
brad.figg at canonical.com
Tue Aug 19 23:22:56 UTC 2014
Buglink? SRU justification? etc.
On 08/19/2014 04:11 PM, dann frazier wrote:
> From: Feng Kan <fkan at apm.com>
>
> Using the glibc cortex-strings work authored by Linaro as a base, create
> new copy_to_user/copy_from_user kernel routines.
>
> Iperf performance increase (single core; -l is the iperf buffer size):
>
>     -l (size)   Original        Optimized
>     64B         34-50.7 Mb/s    44-51 Mb/s
>     1500B       4.7 Gb/s        4.9 Gb/s
>     30000B      14.5 Gb/s       16.2 Gb/s
>
> Signed-off-by: Feng Kan <fkan at apm.com>
> (v2 submittal)
> Reference: http://www.spinics.net/lists/arm-kernel/msg353650.html
> Signed-off-by: dann frazier <dann.frazier at canonical.com>
> ---
> arch/arm64/lib/copy_from_user.S | 36 +-----
> arch/arm64/lib/copy_template.S | 278 ++++++++++++++++++++++++++++++++++++++++
> arch/arm64/lib/copy_to_user.S | 31 +----
> 3 files changed, 284 insertions(+), 61 deletions(-)
> create mode 100644 arch/arm64/lib/copy_template.S
>
> diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
> index 5e27add..c4c5187 100644
> --- a/arch/arm64/lib/copy_from_user.S
> +++ b/arch/arm64/lib/copy_from_user.S
> @@ -15,7 +15,6 @@
> */
>
> #include <linux/linkage.h>
> -#include <asm/assembler.h>
>
> /*
> * Copy from user space to a kernel buffer (alignment handled by the hardware)
> @@ -28,39 +27,10 @@
> * x0 - bytes not copied
> */
> ENTRY(__copy_from_user)
> - add x4, x1, x2 // upper user buffer boundary
> - subs x2, x2, #8
> - b.mi 2f
> -1:
> -USER(9f, ldr x3, [x1], #8 )
> - subs x2, x2, #8
> - str x3, [x0], #8
> - b.pl 1b
> -2: adds x2, x2, #4
> - b.mi 3f
> -USER(9f, ldr w3, [x1], #4 )
> - sub x2, x2, #4
> - str w3, [x0], #4
> -3: adds x2, x2, #2
> - b.mi 4f
> -USER(9f, ldrh w3, [x1], #2 )
> - sub x2, x2, #2
> - strh w3, [x0], #2
> -4: adds x2, x2, #1
> - b.mi 5f
> -USER(9f, ldrb w3, [x1] )
> - strb w3, [x0]
> -5: mov x0, #0
> - ret
> +#include "copy_template.S"
> ENDPROC(__copy_from_user)
>
> .section .fixup,"ax"
> - .align 2
> -9: sub x2, x4, x1
> - mov x3, x2
> -10: strb wzr, [x0], #1 // zero remaining buffer space
> - subs x3, x3, #1
> - b.ne 10b
> - mov x0, x2 // bytes not copied
> - ret
> + .align 2
> + copy_abort_table
> .previous
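
For context (not part of the patch): the contract this assembly implements
from C is the usual uaccess one -- copy_from_user() returns the number of
bytes that could NOT be copied, and on a fault __copy_from_user zero-fills
whatever is left of the kernel buffer, which is what the fixup above (and
the new copy_abort_table) does. A minimal sketch of a caller; the helper
and struct names are made up:

    #include <linux/errno.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct my_args {                    /* illustrative only */
            u32 flags;
            u64 addr;
    };

    static long my_copyin(void __user *uptr)
    {
            struct my_args args;

            /* 0 on success, otherwise the number of bytes left uncopied */
            if (copy_from_user(&args, uptr, sizeof(args)))
                    return -EFAULT;

            /* any bytes that faulted were zeroed by the fixup path */
            return 0;
    }
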
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> new file mode 100644
> index 0000000..f2c7003
> --- /dev/null
> +++ b/arch/arm64/lib/copy_template.S
> @@ -0,0 +1,278 @@
> +/*
> + * Copyright (c) 2013, Applied Micro Circuits Corporation
> + * Copyright (c) 2012-2013, Linaro Limited
> + *
> + * Author: Feng Kan <fkan at apm.com>
> + * Author: Philipp Tomsich <philipp.tomsich at theobroma-systems.com>
> + *
> + * The code is adapted from the memcpy routine by Linaro Limited.
> + *
> + * This file is free software: you may copy, redistribute and/or modify it
> + * under the terms of the GNU General Public License as published by the
> + * Free Software Foundation, either version 2 of the License, or (at your
> + * option) any later version.
> + *
> + * This file is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + *
> + * This file incorporates work covered by the following copyright and
> + * permission notice:
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are met:
> + * 1 Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2 Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3 Neither the name of the Linaro nor the
> + * names of its contributors may be used to endorse or promote products
> + * derived from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +#include <asm/assembler.h>
> +
> +dstin .req x0
> +src .req x1
> +count .req x2
> +tmp1 .req x3
> +tmp1w .req w3
> +tmp2 .req x4
> +tmp2w .req w4
> +tmp3 .req x5
> +tmp3w .req w5
> +dst .req x6
> +
> +A_l .req x7
> +A_h .req x8
> +B_l .req x9
> +B_h .req x10
> +C_l .req x11
> +C_h .req x12
> +D_l .req x13
> +D_h .req x14
> +
> + mov dst, dstin
> + cmp count, #64
> + b.ge .Lcpy_not_short
> + cmp count, #15
> + b.le .Ltail15tiny
> +
> + /*
> + * Deal with small copies quickly by dropping straight into the
> + * exit block.
> + */
> +.Ltail63:
> + /*
> + * Copy up to 48 bytes of data. At this point we only need the
> + * bottom 6 bits of count to be accurate.
> + */
> + ands tmp1, count, #0x30
> + b.eq .Ltail15
> + add dst, dst, tmp1
> + add src, src, tmp1
> + cmp tmp1w, #0x20
> + b.eq 1f
> + b.lt 2f
> + USER(8f, ldp A_l, A_h, [src, #-48])
> + USER(8f, stp A_l, A_h, [dst, #-48])
> +1:
> + USER(8f, ldp A_l, A_h, [src, #-32])
> + USER(8f, stp A_l, A_h, [dst, #-32])
> +2:
> + USER(8f, ldp A_l, A_h, [src, #-16])
> + USER(8f, stp A_l, A_h, [dst, #-16])
> +
> +.Ltail15:
> + ands count, count, #15
> + beq 1f
> + add src, src, count
> + USER(9f, ldp A_l, A_h, [src, #-16])
> + add dst, dst, count
> + USER(9f, stp A_l, A_h, [dst, #-16])
> +1:
> + b .Lsuccess
> +
> +.Ltail15tiny:
> + /*
> + * Copy up to 15 bytes of data. Does not assume additional data
> + * being copied.
> + */
> + tbz count, #3, 1f
> + USER(10f, ldr tmp1, [src], #8)
> + USER(10f, str tmp1, [dst], #8)
> +1:
> + tbz count, #2, 1f
> + USER(10f, ldr tmp1w, [src], #4)
> + USER(10f, str tmp1w, [dst], #4)
> +1:
> + tbz count, #1, 1f
> + USER(10f, ldrh tmp1w, [src], #2)
> + USER(10f, strh tmp1w, [dst], #2)
> +1:
> + tbz count, #0, 1f
> + USER(10f, ldrb tmp1w, [src])
> + USER(10f, strb tmp1w, [dst])
> +1:
> + b .Lsuccess
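
For anyone reading the tail handling cold: the tbz sequence above is the
usual "test one bit of the count per power of two" pattern. A rough C
analogue of .Ltail15tiny (illustrative only; it ignores the USER() fault
handling the assembly needs, and the function name is made up):

    #include <linux/string.h>
    #include <linux/types.h>

    /* Copy the final 0-15 bytes by testing bits 3..0 of the count. */
    static void tail15tiny(u8 *dst, const u8 *src, size_t count)
    {
            if (count & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
            if (count & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
            if (count & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
            if (count & 1) { *dst = *src; }
    }
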
> +
> +.Lcpy_not_short:
> + /*
> + * We don't much care about the alignment of DST, but we want SRC
> + * to be 128-bit (16 byte) aligned so that we don't cross cache line
> + * boundaries on both loads and stores.
> + */
> + neg tmp2, src
> + ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
> + b.eq 2f
> + sub count, count, tmp2
> + /*
> + * Copy more data than needed; it's faster than jumping
> + * around copying sub-Quadword quantities. We know that
> + * it can't overrun.
> + */
> + USER(11f, ldp A_l, A_h, [src])
> + add src, src, tmp2
> + USER(11f, stp A_l, A_h, [dst])
> + add dst, dst, tmp2
> + /* There may be less than 63 bytes to go now. */
> + cmp count, #63
> + b.le .Ltail63
> +2:
> + subs count, count, #128
> + b.ge .Lcpy_body_large
> + /*
> + * Less than 128 bytes to copy, so handle 64 here and then jump
> + * to the tail.
> + */
> + USER(12f, ldp A_l, A_h, [src])
> + USER(12f, ldp B_l, B_h, [src, #16])
> + USER(12f, ldp C_l, C_h, [src, #32])
> + USER(12f, ldp D_l, D_h, [src, #48])
> + USER(12f, stp A_l, A_h, [dst])
> + USER(12f, stp B_l, B_h, [dst, #16])
> + USER(12f, stp C_l, C_h, [dst, #32])
> + USER(12f, stp D_l, D_h, [dst, #48])
> + tst count, #0x3f
> + add src, src, #64
> + add dst, dst, #64
> + b.ne .Ltail63
> + b .Lsuccess
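
The head alignment trick above ("neg tmp2, src; ands tmp2, tmp2, #15",
then a full 16-byte copy that deliberately overlaps the aligned copies
which follow) is easier to see in C. A sketch, assuming the caller has
already guaranteed count >= 64 so the overlap cannot overrun; the helper
name is made up:

    #include <linux/string.h>
    #include <linux/types.h>

    static void align_src_to_16(u8 **dst, const u8 **src, size_t *count)
    {
            /* (-src) & 15 == bytes needed to reach the next 16-byte boundary */
            size_t head = (0 - (uintptr_t)*src) & 15;

            if (!head)
                    return;

            /*
             * Copy a whole 16 bytes but advance only by 'head'; the extra
             * bytes are rewritten by the aligned loop that follows.
             */
            memcpy(*dst, *src, 16);
            *dst += head;
            *src += head;
            *count -= head;
    }
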
> +
> + /*
> + * Critical loop. Start at a new cache line boundary. Assuming
> + * 64 bytes per line this ensures the entire loop is in one line.
> + */
> + .p2align 6
> +.Lcpy_body_large:
> + /* There are at least 128 bytes to copy. */
> + USER(12f, ldp A_l, A_h, [src, #0])
> + sub dst, dst, #16 /* Pre-bias. */
> + USER(13f, ldp B_l, B_h, [src, #16])
> + USER(13f, ldp C_l, C_h, [src, #32])
> + USER(13f, ldp D_l, D_h, [src, #48]!) /* src += 64 - Pre-bias. */
> +1:
> + USER(13f, stp A_l, A_h, [dst, #16])
> + USER(13f, ldp A_l, A_h, [src, #16])
> + USER(13f, stp B_l, B_h, [dst, #32])
> + USER(13f, ldp B_l, B_h, [src, #32])
> + USER(13f, stp C_l, C_h, [dst, #48])
> + USER(13f, ldp C_l, C_h, [src, #48])
> + USER(13f, stp D_l, D_h, [dst, #64]!)
> + USER(13f, ldp D_l, D_h, [src, #64]!)
> + subs count, count, #64
> + b.ge 1b
> + USER(14f, stp A_l, A_h, [dst, #16])
> + USER(14f, stp B_l, B_h, [dst, #32])
> + USER(14f, stp C_l, C_h, [dst, #48])
> + USER(14f, stp D_l, D_h, [dst, #64])
> + add src, src, #16
> + add dst, dst, #64 + 16
> + tst count, #0x3f
> + b.ne .Ltail63
> +.Lsuccess:
> + /* Nothing left to copy */
> + mov x0, #0
> + ret
> +
> + .macro copy_abort_table
> +8:
> + /*
> + * Count bytes remain
> + * dst points to (dst + tmp1)
> + */
> + mov x0, count
> + sub dst, dst, tmp1
> + b .Lfinalize
> +9:
> + /*
> + * 16 bytes remain
> + * dst is accurate
> + */
> + mov x0, #16
> + b .Lfinalize
> +10:
> + /*
> + * count is accurate
> + * dst is accurate
> + */
> + mov x0, count
> + b .Lfinalize
> +11:
> + /*
> + *(count + tmp2) bytes remain
> + * dst points to the start of the remaining bytes
> + */
> + add x0, count, tmp2
> + b .Lfinalize
> +12:
> + /*
> + * (count + 128) bytes remain
> + * dst is accurate
> + */
> + add x0, count, #128
> + b .Lfinalize
> +13:
> + /*
> + * (count + 128) bytes remain
> + * dst is pre-biased to (dst + 16)
> + */
> + add x0, count, #128
> + sub dst, dst, #16
> + b .Lfinalize
> +14:
> + /*
> + * count is accurate
> + * dst is pre-biased to (dst + 16)
> + */
> + mov x0, count
> + sub dst, dst, #16
> + /* fall-through */
> +.Lfinalize:
> + /*
> + * Zero the remaining destination buffer
> + */
> + mov count, x0
> +20:
> + /* Zero remaining buffer space */
> + strb wzr, [dst], #1
> + subs count, count, #1
> + b.ne 20b
> + ret
> + .endm
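
The numbered fixup labels in copy_abort_table only work because each USER()
annotation records the address of its load/store plus the label in the
__ex_table section; on a user-space fault the fault handler looks the
faulting PC up there and resumes at the label with count/tmp1/tmp2 still
live, which is what the per-label accounting comments rely on. A sketch of
the C side of that mechanism, from memory (see arch/arm64/mm/extable.c and
asm/uaccess.h for the authoritative versions):

    /* one entry per USER()-wrapped instruction */
    struct exception_table_entry {
            unsigned long insn;     /* address of the faulting load/store  */
            unsigned long fixup;    /* address of the 8:..14: label above  */
    };

    int fixup_exception(struct pt_regs *regs)
    {
            const struct exception_table_entry *fixup;

            fixup = search_exception_tables(instruction_pointer(regs));
            if (fixup)
                    regs->pc = fixup->fixup;  /* resume in copy_abort_table */

            return fixup != NULL;
    }
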
> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index a0aeeb9..08787b0 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -15,7 +15,6 @@
> */
>
> #include <linux/linkage.h>
> -#include <asm/assembler.h>
>
> /*
> * Copy to user space from a kernel buffer (alignment handled by the hardware)
> @@ -28,34 +27,10 @@
> * x0 - bytes not copied
> */
> ENTRY(__copy_to_user)
> - add x4, x0, x2 // upper user buffer boundary
> - subs x2, x2, #8
> - b.mi 2f
> -1:
> - ldr x3, [x1], #8
> - subs x2, x2, #8
> -USER(9f, str x3, [x0], #8 )
> - b.pl 1b
> -2: adds x2, x2, #4
> - b.mi 3f
> - ldr w3, [x1], #4
> - sub x2, x2, #4
> -USER(9f, str w3, [x0], #4 )
> -3: adds x2, x2, #2
> - b.mi 4f
> - ldrh w3, [x1], #2
> - sub x2, x2, #2
> -USER(9f, strh w3, [x0], #2 )
> -4: adds x2, x2, #1
> - b.mi 5f
> - ldrb w3, [x1]
> -USER(9f, strb w3, [x0] )
> -5: mov x0, #0
> - ret
> +#include "copy_template.S"
> ENDPROC(__copy_to_user)
>
> .section .fixup,"ax"
> - .align 2
> -9: sub x0, x4, x0 // bytes not copied
> - ret
> + .align 2
> + copy_abort_table
> .previous
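
Same pattern on the copy_to_user() side: the return value is again the
number of bytes not copied, so callers typically just map non-zero to
-EFAULT. Continuing the made-up example from above:

    static long my_copyout(void __user *uptr, const struct my_args *args)
    {
            /* 0 on success, otherwise bytes left uncopied in user space */
            if (copy_to_user(uptr, args, sizeof(*args)))
                    return -EFAULT;

            return 0;
    }
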
>
--
Brad Figg brad.figg at canonical.com http://www.canonical.com