author     Michael Olbrich <m.olbrich@pengutronix.de>  2011-09-15 16:53:32 +0200
committer  Michael Olbrich <m.olbrich@pengutronix.de>  2011-11-22 15:32:48 +0100
commit     4bf80f0dc1e792fc8eb471e7b6a0ec35ab30aa83 (patch)
tree       13e05d65259792a514f3eba0b4d8aa042c83886d
parent     41b99d2dc61897bb0f32d055b20ed75949240aef (diff)
download   OSELAS.Toolchain-4bf80f0dc1e792fc8eb471e7b6a0ec35ab30aa83.tar.gz
           OSELAS.Toolchain-4bf80f0dc1e792fc8eb471e7b6a0ec35ab30aa83.tar.xz
glibc: add patches for linaro NEON string functions
Signed-off-by: Michael Olbrich <m.olbrich@pengutronix.de>
-rw-r--r--  patches/glibc-2.13/0024-optimized-string-functions-for-NEON-from-Linaro.patch      | 1295
-rw-r--r--  patches/glibc-2.13/0025-add-libc_hidden_builtin_def-for-all-cortex-functions.patch |   79
-rw-r--r--  patches/glibc-2.13/series                                                          |    2
3 files changed, 1376 insertions, 0 deletions
diff --git a/patches/glibc-2.13/0024-optimized-string-functions-for-NEON-from-Linaro.patch b/patches/glibc-2.13/0024-optimized-string-functions-for-NEON-from-Linaro.patch
new file mode 100644
index 0000000..3e56ec8
--- /dev/null
+++ b/patches/glibc-2.13/0024-optimized-string-functions-for-NEON-from-Linaro.patch
@@ -0,0 +1,1295 @@
+From: Michael Olbrich <m.olbrich@pengutronix.de>
+Date: Thu, 15 Sep 2011 16:50:56 +0200
+Subject: [PATCH] optimized string functions for NEON from Linaro
+
+Signed-off-by: Michael Olbrich <m.olbrich@pengutronix.de>
+---
+ .../sysdeps/arm/eabi/arm/cortex-a8/memchr.S | 150 +++++++
+ .../sysdeps/arm/eabi/arm/cortex-a8/memcpy.S | 152 +++++++
+ .../sysdeps/arm/eabi/arm/cortex-a8/memset.S | 118 +++++
+ .../sysdeps/arm/eabi/arm/cortex-a8/strchr.S | 76 ++++
+ .../sysdeps/arm/eabi/arm/cortex-a8/strcmp.c | 449 ++++++++++++++++++++
+ .../sysdeps/arm/eabi/arm/cortex-a8/strcpy.c | 172 ++++++++
+ .../sysdeps/arm/eabi/arm/cortex-a8/strlen.S | 111 +++++
+ 7 files changed, 1228 insertions(+), 0 deletions(-)
+ create mode 100644 cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S
+ create mode 100644 cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S
+ create mode 100644 cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S
+ create mode 100644 cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S
+ create mode 100644 cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcmp.c
+ create mode 100644 cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c
+ create mode 100644 cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S
+
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S
+new file mode 100644
+index 0000000..8f5aaa9
+--- /dev/null
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S
+@@ -0,0 +1,150 @@
++/* Copyright (c) 2010-2011, Linaro Limited
++ All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++
++ * Neither the name of Linaro Limited nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++ Written by Dave Gilbert <david.gilbert@linaro.org>
++
++ This memchr routine is optimised on a Cortex-A9 and should work on
++ all ARMv7 processors. It has a fast path for short sizes, and has
++ an optimised path for large data sets; the worst case is finding the
++ match early in a large data set. */
++
++@ 2011-02-07 david.gilbert@linaro.org
++@ Extracted from local git a5b438d861
++@ 2011-07-14 david.gilbert@linaro.org
++@ Import endianness fix from local git ea786f1b
++
++ .syntax unified
++ .arch armv7-a
++
++@ this lets us check a flag in a 00/ff byte easily in either endianness
++#ifdef __ARMEB__
++#define CHARTSTMASK(c) 1<<(31-(c*8))
++#else
++#define CHARTSTMASK(c) 1<<(c*8)
++#endif
++ .text
++ .thumb
++
++@ ---------------------------------------------------------------------------
++ .thumb_func
++ .align 2
++ .p2align 4,,15
++ .global memchr
++ .type memchr,%function
++memchr:
++ @ r0 = start of memory to scan
++ @ r1 = character to look for
++ @ r2 = length
++ @ returns r0 = pointer to character or NULL if not found
++ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
++
++ cmp r2,#16 @ If it's short don't bother with anything clever
++ blt 20f
++
++ tst r0, #7 @ If it's already aligned skip the next bit
++ beq 10f
++
++ @ Work up to an aligned point
++5:
++ ldrb r3, [r0],#1
++ subs r2, r2, #1
++ cmp r3, r1
++ beq 50f @ If it matches exit found
++ tst r0, #7
++ cbz r2, 40f @ If we run off the end, exit not found
++ bne 5b @ If not aligned yet then do next byte
++
++10:
++ @ At this point, we are aligned, we know we have at least 8 bytes to work with
++ push {r4,r5,r6,r7}
++ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
++ orr r1, r1, r1, lsl #16
++ bic r4, r2, #7 @ Number of double words to work with
++ mvns r7, #0 @ all F's
++ movs r3, #0
++
++15:
++ ldmia r0!,{r5,r6}
++ subs r4, r4, #8
++ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
++ eor r6,r6, r1
++ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
++ sel r5, r3, r7 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
++ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
++ sel r6, r5, r7 @ chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
++ cbnz r6, 60f
++ bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
++
++ pop {r4,r5,r6,r7}
++ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
++ and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
++
++20:
++ cbz r2, 40f @ 0 length or hit the end already then not found
++
++21: @ Post aligned section, or just a short call
++ ldrb r3,[r0],#1
++ subs r2,r2,#1
++ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
++ cbz r3, 50f
++ bne 21b @ on r2 flags
++
++40:
++ movs r0,#0 @ not found
++ bx lr
++
++50:
++ subs r0,r0,#1 @ found
++ bx lr
++
++60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
++ @ r0 points to the start of the double word after the one that was tested
++ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
++ cmp r5, #0
++ itte eq
++ moveq r5, r6 @ the end is in the 2nd word
++ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
++ subne r0,r0,#7 @ or 2nd byte of 1st word
++
++ @ r0 currently points to the 3rd byte of the word containing the hit
++ tst r5, # CHARTSTMASK(0) @ 1st character
++ bne 61f
++ adds r0,r0,#1
++ tst r5, # CHARTSTMASK(1) @ 2nd character
++ ittt eq
++ addeq r0,r0,#1
++ tsteq r5, # (3<<15) @ 2nd & 3rd character
++ @ If not the 3rd must be the last one
++ addeq r0,r0,#1
++
++61:
++ pop {r4,r5,r6,r7}
++ subs r0,r0,#1
++ bx lr
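The word-at-a-time scan above turns each loaded word into a 00/FF per-byte mask with uadd8/sel. As a rough portable-C illustration of the same idea (a sketch only, not the Linaro or glibc source; it uses the classic zero-byte bit trick in place of the ARM SIMD instructions):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* memchr's strategy: byte-wise until word aligned, then whole words.
   XOR with the target byte replicated into every lane so matching bytes
   become zero, then detect a zero byte with
   (w - 0x01010101) & ~w & 0x80808080. */
static const void *memchr_sketch(const void *src, int c, size_t n)
{
    const unsigned char *p = src;
    const unsigned char target = (unsigned char)c;
    const uint32_t lanes = target * 0x01010101u;   /* target in every byte */

    while (n && ((uintptr_t)p & 3)) {              /* reach word alignment */
        if (*p == target)
            return p;
        p++, n--;
    }
    while (n >= 4) {
        uint32_t w;
        memcpy(&w, p, sizeof w);
        w ^= lanes;                                /* matching bytes -> 0x00 */
        if ((w - 0x01010101u) & ~w & 0x80808080u)
            break;                                 /* a match is in this word */
        p += 4, n -= 4;
    }
    while (n--) {                                  /* short or remaining bytes */
        if (*p == target)
            return p;
        p++;
    }
    return NULL;
}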
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S
+new file mode 100644
+index 0000000..3be24ca
+--- /dev/null
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S
+@@ -0,0 +1,152 @@
++/* Copyright (c) 2010-2011, Linaro Limited
++ All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++
++ * Neither the name of Linaro Limited nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++ Written by Dave Gilbert <david.gilbert@linaro.org>
++
++ This memcpy routine is optimised on a Cortex-A9 and should work on
++ all ARMv7 processors with NEON. */
++
++@ 2011-09-01 david.gilbert@linaro.org
++@ Extracted from local git 2f11b436
++
++ .syntax unified
++ .arch armv7-a
++
++@ this lets us check a flag in a 00/ff byte easily in either endianness
++#ifdef __ARMEB__
++#define CHARTSTMASK(c) 1<<(31-(c*8))
++#else
++#define CHARTSTMASK(c) 1<<(c*8)
++#endif
++ .text
++ .thumb
++
++@ ---------------------------------------------------------------------------
++ .thumb_func
++ .align 2
++ .p2align 4,,15
++ .global memcpy
++ .type memcpy,%function
++memcpy:
++ @ r0 = dest
++ @ r1 = source
++ @ r2 = count
++ @ returns dest in r0
++ @ Overlaps of source/dest not allowed according to spec
++ @ Note this routine relies on v7 misaligned loads/stores
++ pld [r1]
++ mov r12, r0 @ stash original r0
++ cmp r2,#32
++ blt 10f @ take the small copy case separately
++
++ @ test for either source or destination being misaligned
++ @ (We only rely on word align)
++ tst r0,#3
++ it eq
++ tsteq r1,#3
++ bne 30f @ misaligned case
++
++4:
++ @ at this point we are word (or better) aligned and have at least
++ @ 32 bytes to play with
++
++ @ If it's a huge copy, try Neon
++ cmp r2, #128*1024
++ bge 35f @ Sharing general non-aligned case here, aligned could be faster
++
++ push {r3,r4,r5,r6,r7,r8,r10,r11}
++5:
++ ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
++ sub r2,r2,#32
++ pld [r1,#96]
++ cmp r2,#32
++ stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
++ bge 5b
++
++ pop {r3,r4,r5,r6,r7,r8,r10,r11}
++ @ We are now down to less than 32 bytes
++ cbz r2,15f @ quick exit for the case where we copied a multiple of 32
++
++10: @ small copies (not necessarily aligned - note might be slightly more than 32 bytes)
++ cmp r2,#4
++ blt 12f
++11:
++ sub r2,r2,#4
++ cmp r2,#4
++ ldr r3, [r1],#4
++ str r3, [r0],#4
++ bge 11b
++12:
++ tst r2,#2
++ itt ne
++ ldrhne r3, [r1],#2
++ strhne r3, [r0],#2
++
++ tst r2,#1
++ itt ne
++ ldrbne r3, [r1],#1
++ strbne r3, [r0],#1
++
++15: @ exit
++ mov r0,r12 @ restore r0
++ bx lr
++
++ .align 2
++ .p2align 4,,15
++30: @ non-aligned - at least 32 bytes to play with
++ @ Test for co-misalignment
++ eor r3, r0, r1
++ tst r3,#3
++ beq 50f
++
++ @ Use Neon for misaligned
++35:
++ vld1.8 {d0,d1,d2,d3}, [r1]!
++ sub r2,r2,#32
++ cmp r2,#32
++ pld [r1,#96]
++ vst1.8 {d0,d1,d2,d3}, [r0]!
++ bge 35b
++ b 10b @ TODO: Probably a bad idea to switch to ARM at this point
++
++ .align 2
++ .p2align 4,,15
++50: @ Co-misaligned
++ @ At this point we've got at least 32 bytes
++51:
++ ldrb r3,[r1],#1
++ sub r2,r2,#1
++ strb r3,[r0],#1
++ tst r0,#7
++ bne 51b
++
++ cmp r2,#32
++ blt 10b
++ b 4b
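The copy above dispatches on size and alignment: short copies take a simple loop, word-co-aligned buffers are moved 32 bytes per iteration with ldmia/stmia, very large or misaligned copies go through NEON vld1/vst1, and co-misaligned buffers are first brought to alignment byte by byte. A minimal portable-C sketch of that shape (illustrative only; the NEON path is approximated by the plain loops):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *memcpy_sketch(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    /* Only worth the setup if the copy is biggish and src/dst can be
       brought to a common word alignment (otherwise: co-misaligned). */
    if (n >= 32 && ((((uintptr_t)d ^ (uintptr_t)s) & 3) == 0)) {
        while ((uintptr_t)d & 3) {          /* align both pointers */
            *d++ = *s++;
            n--;
        }
        while (n >= 4) {                    /* bulk: a word at a time (the
                                               asm: 32 bytes via ldmia/stmia,
                                               or NEON for huge copies)     */
            uint32_t w;
            memcpy(&w, s, sizeof w);
            memcpy(d, &w, sizeof w);
            d += 4, s += 4, n -= 4;
        }
    }
    while (n--)                             /* tail, small, or misaligned */
        *d++ = *s++;
    return dst;
}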
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S
+new file mode 100644
+index 0000000..921cb75
+--- /dev/null
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S
+@@ -0,0 +1,118 @@
++/* Copyright (c) 2010-2011, Linaro Limited
++ All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++
++ * Neither the name of Linaro Limited nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++ Written by Dave Gilbert <david.gilbert@linaro.org>
++
++ This memset routine is optimised on a Cortex-A9 and should work on
++ all ARMv7 processors. */
++
++ .syntax unified
++ .arch armv7-a
++
++@ 2011-08-30 david.gilbert@linaro.org
++@ Extracted from local git 2f11b436
++
++@ this lets us check a flag in a 00/ff byte easily in either endianness
++#ifdef __ARMEB__
++#define CHARTSTMASK(c) 1<<(31-(c*8))
++#else
++#define CHARTSTMASK(c) 1<<(c*8)
++#endif
++ .text
++ .thumb
++
++@ ---------------------------------------------------------------------------
++ .thumb_func
++ .align 2
++ .p2align 4,,15
++ .global memset
++ .type memset,%function
++memset:
++ @ r0 = address
++ @ r1 = character
++ @ r2 = count
++ @ returns original address in r0
++
++ mov r3, r0 @ Leave r0 alone
++ cbz r2, 10f @ Exit if 0 length
++
++ tst r0, #7
++ beq 2f @ Already aligned
++
++ @ Ok, so we're misaligned here
++1:
++ strb r1, [r3], #1
++ subs r2,r2,#1
++ tst r3, #7
++ cbz r2, 10f @ Exit if we hit the end
++ bne 1b @ go round again if still misaligned
++
++2:
++ @ OK, so we're aligned
++ push {r4,r5,r6,r7}
++ bics r4, r2, #15 @ if less than 16 bytes then need to finish it off
++ beq 5f
++
++3:
++ @ POSIX says that ch is cast to an unsigned char. A uxtb is one
++ @ byte and takes two cycles, where an AND is four bytes but one
++ @ cycle.
++ and r1, #0xFF
++ orr r1, r1, r1, lsl#8 @ Same character into all bytes
++ orr r1, r1, r1, lsl#16
++ mov r5,r1
++ mov r6,r1
++ mov r7,r1
++
++4:
++ subs r4,r4,#16
++ stmia r3!,{r1,r5,r6,r7}
++ bne 4b
++ and r2,r2,#15
++
++ @ At this point we're still aligned and we have up to align-1 bytes left to write
++ @ we can avoid some of the byte-at-a time now by testing for some big chunks
++ tst r2,#8
++ itt ne
++ subne r2,r2,#8
++ stmiane r3!,{r1,r5}
++
++5:
++ pop {r4,r5,r6,r7}
++ cbz r2, 10f
++
++ @ Got to do any last < alignment bytes
++6:
++ subs r2,r2,#1
++ strb r1,[r3],#1
++ bne 6b
++
++10:
++ bx lr @ goodbye
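The core of the memset above is the byte replication at label 3 (the fill character is spread across a 32-bit word, then stored 16 bytes at a time) plus byte loops for the misaligned head and the tail. Roughly, in portable C (a sketch under those assumptions, not the actual implementation):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *memset_sketch(void *dst, int c, size_t n)
{
    unsigned char *p = dst;
    /* POSIX: the fill value is converted to unsigned char first
       (the "and r1, #0xFF" above), then replicated into every byte. */
    const uint32_t word = (unsigned char)c * 0x01010101u;

    while (n && ((uintptr_t)p & 3)) {   /* misaligned head, byte-wise */
        *p++ = (unsigned char)c;
        n--;
    }
    while (n >= 4) {                    /* aligned bulk, a word at a time
                                           (the asm stores 16 bytes per
                                           stmia iteration)              */
        memcpy(p, &word, sizeof word);
        p += 4, n -= 4;
    }
    while (n--)                         /* tail */
        *p++ = (unsigned char)c;
    return dst;
}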
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S
+new file mode 100644
+index 0000000..8875dbf
+--- /dev/null
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S
+@@ -0,0 +1,76 @@
++/* Copyright (c) 2010-2011, Linaro Limited
++ All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++
++ * Neither the name of Linaro Limited nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++ Written by Dave Gilbert <david.gilbert@linaro.org>
++
++ A very simple strchr routine; from benchmarks on A9 it's a bit faster than
++ the current version in eglibc (2.12.1-0ubuntu14 package).
++ I don't think doing a word-at-a-time version is worth it, since a lot
++ of strchr cases are very short anyway */
++
++@ 2011-02-07 david.gilbert@linaro.org
++@ Extracted from local git a5b438d861
++
++ .syntax unified
++ .arch armv7-a
++
++ .text
++ .thumb
++
++@ ---------------------------------------------------------------------------
++
++ .thumb_func
++ .align 2
++ .p2align 4,,15
++ .global strchr
++ .type strchr,%function
++strchr:
++ @ r0 = start of string
++ @ r1 = character to match
++ @ returns NULL for no match, or a pointer to the match
++ and r1,r1, #255
++
++1:
++ ldrb r2,[r0],#1
++ cmp r2,r1
++ cbz r2,10f
++ bne 1b
++
++ @ We're here if it matched
++5:
++ subs r0,r0,#1
++ bx lr
++
++10:
++ @ We're here if we ran off the end
++ cmp r1, #0 @ Corner case - you're allowed to search for the nil and get a pointer to it
++ beq 5b @ A bit messy, if it's common we should branch at the start to a special loop
++ mov r0,#0
++ bx lr
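The only subtle point in this routine is the corner case handled at label 10: strchr(s, '\0') must return a pointer to the terminator, not NULL. A plain-C sketch of those semantics (illustrative, not the byte loop above verbatim):

#include <stddef.h>

static char *strchr_sketch(const char *s, int c)
{
    const char target = (char)c;
    for (;; s++) {
        if (*s == target)
            return (char *)s;   /* also covers c == '\0': returns a pointer
                                   to the terminating NUL, as required      */
        if (*s == '\0')
            return NULL;
    }
}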
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcmp.c b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcmp.c
+new file mode 100644
+index 0000000..fb2280d
+--- /dev/null
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcmp.c
+@@ -0,0 +1,449 @@
++/*
++ * Copyright (c) 2008 ARM Ltd
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. The name of the company may not be used to endorse or promote
++ * products derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
++ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <string.h>
++#include <memcopy.h>
++
++#undef strcmp
++
++
++#ifdef __ARMEB__
++#define SHFT2LSB "lsl"
++#define SHFT2MSB "lsr"
++#define MSB "0x000000ff"
++#define LSB "0xff000000"
++#else
++#define SHFT2LSB "lsr"
++#define SHFT2MSB "lsl"
++#define MSB "0xff000000"
++#define LSB "0x000000ff"
++#endif
++
++#ifdef __thumb2__
++#define magic1(REG) "#0x01010101"
++#define magic2(REG) "#0x80808080"
++#else
++#define magic1(REG) #REG
++#define magic2(REG) #REG ", lsl #7"
++#endif
++
++int
++__attribute__((naked)) strcmp (const char* s1, const char* s2)
++{
++ asm(
++#if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
++ (defined (__thumb__) && !defined (__thumb2__)))
++ "pld [r0, #0]\n\t"
++ "pld [r1, #0]\n\t"
++ "eor r2, r0, r1\n\t"
++ "tst r2, #3\n\t"
++ /* Strings not at same byte offset from a word boundary. */
++ "bne strcmp_unaligned\n\t"
++ "ands r2, r0, #3\n\t"
++ "bic r0, r0, #3\n\t"
++ "bic r1, r1, #3\n\t"
++ "ldr ip, [r0], #4\n\t"
++ "it eq\n\t"
++ "ldreq r3, [r1], #4\n\t"
++ "beq 1f\n\t"
++ /* Although s1 and s2 have identical initial alignment, they are
++ not currently word aligned. Rather than comparing bytes,
++ make sure that any bytes fetched from before the addressed
++ bytes are forced to 0xff. Then they will always compare
++ equal. */
++ "eor r2, r2, #3\n\t"
++ "lsl r2, r2, #3\n\t"
++ "mvn r3, #"MSB"\n\t"
++ SHFT2LSB" r2, r3, r2\n\t"
++ "ldr r3, [r1], #4\n\t"
++ "orr ip, ip, r2\n\t"
++ "orr r3, r3, r2\n"
++ "1:\n\t"
++#ifndef __thumb2__
++ /* Load the 'magic' constant 0x01010101. */
++ "str r4, [sp, #-4]!\n\t"
++ "mov r4, #1\n\t"
++ "orr r4, r4, r4, lsl #8\n\t"
++ "orr r4, r4, r4, lsl #16\n"
++#endif
++ ".p2align 2\n"
++ "4:\n\t"
++ "pld [r0, #8]\n\t"
++ "pld [r1, #8]\n\t"
++ "sub r2, ip, "magic1(r4)"\n\t"
++ "cmp ip, r3\n\t"
++ "itttt eq\n\t"
++ /* check for any zero bytes in first word */
++ "biceq r2, r2, ip\n\t"
++ "tsteq r2, "magic2(r4)"\n\t"
++ "ldreq ip, [r0], #4\n\t"
++ "ldreq r3, [r1], #4\n\t"
++ "beq 4b\n"
++ "2:\n\t"
++ /* There's a zero or a different byte in the word */
++ SHFT2MSB" r0, ip, #24\n\t"
++ SHFT2LSB" ip, ip, #8\n\t"
++ "cmp r0, #1\n\t"
++ "it cs\n\t"
++ "cmpcs r0, r3, "SHFT2MSB" #24\n\t"
++ "it eq\n\t"
++ SHFT2LSB"eq r3, r3, #8\n\t"
++ "beq 2b\n\t"
++ /* On a big-endian machine, r0 contains the desired byte in bits
++ 0-7; on a little-endian machine they are in bits 24-31. In
++ both cases the other bits in r0 are all zero. For r3 the
++ interesting byte is at the other end of the word, but the
++ other bits are not necessarily zero. We need a signed result
++ representing the difference in the unsigned bytes, so for the
++ little-endian case we can't just shift the interesting bits
++ up. */
++#ifdef __ARMEB__
++ "sub r0, r0, r3, lsr #24\n\t"
++#else
++ "and r3, r3, #255\n\t"
++#ifdef __thumb2__
++ /* No RSB instruction in Thumb2 */
++ "lsr r0, r0, #24\n\t"
++ "sub r0, r0, r3\n\t"
++#else
++ "rsb r0, r3, r0, lsr #24\n\t"
++#endif
++#endif
++#ifndef __thumb2__
++ "ldr r4, [sp], #4\n\t"
++#endif
++ "BX LR"
++#elif (defined (__thumb__) && !defined (__thumb2__))
++ "1:\n\t"
++ "ldrb r2, [r0]\n\t"
++ "ldrb r3, [r1]\n\t"
++ "add r0, r0, #1\n\t"
++ "add r1, r1, #1\n\t"
++ "cmp r2, #0\n\t"
++ "beq 2f\n\t"
++ "cmp r2, r3\n\t"
++ "beq 1b\n\t"
++ "2:\n\t"
++ "sub r0, r2, r3\n\t"
++ "bx lr"
++#else
++ "3:\n\t"
++ "ldrb r2, [r0], #1\n\t"
++ "ldrb r3, [r1], #1\n\t"
++ "cmp r2, #1\n\t"
++ "it cs\n\t"
++ "cmpcs r2, r3\n\t"
++ "beq 3b\n\t"
++ "sub r0, r2, r3\n\t"
++ "BX LR"
++#endif
++ );
++}
++
++#if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
++ (defined (__thumb__) && !defined (__thumb2__)))
++static int __attribute__((naked, used))
++strcmp_unaligned(const char* s1, const char* s2)
++{
++#if 0
++ /* The assembly code below is based on the following algorithm. */
++#ifdef __ARMEB__
++#define RSHIFT <<
++#define LSHIFT >>
++#else
++#define RSHIFT >>
++#define LSHIFT <<
++#endif
++
++#define body(shift) \
++ mask = 0xffffffffU RSHIFT shift; \
++ w1 = *wp1++; \
++ w2 = *wp2++; \
++ do \
++ { \
++ t1 = w1 & mask; \
++ if (__builtin_expect(t1 != w2 RSHIFT shift, 0)) \
++ { \
++ w2 RSHIFT= shift; \
++ break; \
++ } \
++ if (__builtin_expect(((w1 - b1) & ~w1) & (b1 << 7), 0)) \
++ { \
++ /* See comment in assembler below re syndrome on big-endian */\
++ if ((((w1 - b1) & ~w1) & (b1 << 7)) & mask) \
++ w2 RSHIFT= shift; \
++ else \
++ { \
++ w2 = *wp2; \
++ t1 = w1 RSHIFT (32 - shift); \
++ w2 = (w2 LSHIFT (32 - shift)) RSHIFT (32 - shift); \
++ } \
++ break; \
++ } \
++ w2 = *wp2++; \
++ t1 ^= w1; \
++ if (__builtin_expect(t1 != w2 LSHIFT (32 - shift), 0)) \
++ { \
++ t1 = w1 >> (32 - shift); \
++ w2 = (w2 << (32 - shift)) RSHIFT (32 - shift); \
++ break; \
++ } \
++ w1 = *wp1++; \
++ } while (1)
++
++ const unsigned* wp1;
++ const unsigned* wp2;
++ unsigned w1, w2;
++ unsigned mask;
++ unsigned shift;
++ unsigned b1 = 0x01010101;
++ char c1, c2;
++ unsigned t1;
++
++ while (((unsigned) s1) & 3)
++ {
++ c1 = *s1++;
++ c2 = *s2++;
++ if (c1 == 0 || c1 != c2)
++ return c1 - (int)c2;
++ }
++ wp1 = (unsigned*) (((unsigned)s1) & ~3);
++ wp2 = (unsigned*) (((unsigned)s2) & ~3);
++ t1 = ((unsigned) s2) & 3;
++ if (t1 == 1)
++ {
++ body(8);
++ }
++ else if (t1 == 2)
++ {
++ body(16);
++ }
++ else
++ {
++ body (24);
++ }
++
++ do
++ {
++#ifdef __ARMEB__
++ c1 = (char) t1 >> 24;
++ c2 = (char) w2 >> 24;
++#else
++ c1 = (char) t1;
++ c2 = (char) w2;
++#endif
++ t1 RSHIFT= 8;
++ w2 RSHIFT= 8;
++ } while (c1 != 0 && c1 == c2);
++ return c1 - c2;
++#endif
++
++ asm("wp1 .req r0\n\t"
++ "wp2 .req r1\n\t"
++ "b1 .req r2\n\t"
++ "w1 .req r4\n\t"
++ "w2 .req r5\n\t"
++ "t1 .req ip\n\t"
++ "@ r3 is scratch\n"
++
++ /* First of all, compare bytes until wp1(sp1) is word-aligned. */
++ "1:\n\t"
++ "tst wp1, #3\n\t"
++ "beq 2f\n\t"
++ "ldrb r2, [wp1], #1\n\t"
++ "ldrb r3, [wp2], #1\n\t"
++ "cmp r2, #1\n\t"
++ "it cs\n\t"
++ "cmpcs r2, r3\n\t"
++ "beq 1b\n\t"
++ "sub r0, r2, r3\n\t"
++ "BX LR\n"
++
++ "2:\n\t"
++ "str r5, [sp, #-4]!\n\t"
++ "str r4, [sp, #-4]!\n\t"
++ // "stmfd sp!, {r4, r5}\n\t"
++ "mov b1, #1\n\t"
++ "orr b1, b1, b1, lsl #8\n\t"
++ "orr b1, b1, b1, lsl #16\n\t"
++
++ "and t1, wp2, #3\n\t"
++ "bic wp2, wp2, #3\n\t"
++ "ldr w1, [wp1], #4\n\t"
++ "ldr w2, [wp2], #4\n\t"
++ "cmp t1, #2\n\t"
++ "beq 2f\n\t"
++ "bhi 3f\n"
++
++ /* Critical inner Loop: Block with 3 bytes initial overlap */
++ ".p2align 2\n"
++ "1:\n\t"
++ "bic t1, w1, #"MSB"\n\t"
++ "cmp t1, w2, "SHFT2LSB" #8\n\t"
++ "sub r3, w1, b1\n\t"
++ "bic r3, r3, w1\n\t"
++ "bne 4f\n\t"
++ "ands r3, r3, b1, lsl #7\n\t"
++ "it eq\n\t"
++ "ldreq w2, [wp2], #4\n\t"
++ "bne 5f\n\t"
++ "eor t1, t1, w1\n\t"
++ "cmp t1, w2, "SHFT2MSB" #24\n\t"
++ "bne 6f\n\t"
++ "ldr w1, [wp1], #4\n\t"
++ "b 1b\n"
++ "4:\n\t"
++ SHFT2LSB" w2, w2, #8\n\t"
++ "b 8f\n"
++
++ "5:\n\t"
++#ifdef __ARMEB__
++ /* The syndrome value may contain false ones if the string ends
++ with the bytes 0x01 0x00 */
++ "tst w1, #0xff000000\n\t"
++ "itt ne\n\t"
++ "tstne w1, #0x00ff0000\n\t"
++ "tstne w1, #0x0000ff00\n\t"
++ "beq 7f\n\t"
++#else
++ "bics r3, r3, #0xff000000\n\t"
++ "bne 7f\n\t"
++#endif
++ "ldrb w2, [wp2]\n\t"
++ SHFT2LSB" t1, w1, #24\n\t"
++#ifdef __ARMEB__
++ "lsl w2, w2, #24\n\t"
++#endif
++ "b 8f\n"
++
++ "6:\n\t"
++ SHFT2LSB" t1, w1, #24\n\t"
++ "and w2, w2, #"LSB"\n\t"
++ "b 8f\n"
++
++ /* Critical inner Loop: Block with 2 bytes initial overlap */
++ ".p2align 2\n"
++ "2:\n\t"
++ SHFT2MSB" t1, w1, #16\n\t"
++ "sub r3, w1, b1\n\t"
++ SHFT2LSB" t1, t1, #16\n\t"
++ "bic r3, r3, w1\n\t"
++ "cmp t1, w2, "SHFT2LSB" #16\n\t"
++ "bne 4f\n\t"
++ "ands r3, r3, b1, lsl #7\n\t"
++ "it eq\n\t"
++ "ldreq w2, [wp2], #4\n\t"
++ "bne 5f\n\t"
++ "eor t1, t1, w1\n\t"
++ "cmp t1, w2, "SHFT2MSB" #16\n\t"
++ "bne 6f\n\t"
++ "ldr w1, [wp1], #4\n\t"
++ "b 2b\n"
++
++ "5:\n\t"
++#ifdef __ARMEB__
++ /* The syndrome value may contain false ones if the string ends
++ with the bytes 0x01 0x00 */
++ "tst w1, #0xff000000\n\t"
++ "it ne\n\t"
++ "tstne w1, #0x00ff0000\n\t"
++ "beq 7f\n\t"
++#else
++ "lsls r3, r3, #16\n\t"
++ "bne 7f\n\t"
++#endif
++ "ldrh w2, [wp2]\n\t"
++ SHFT2LSB" t1, w1, #16\n\t"
++#ifdef __ARMEB__
++ "lsl w2, w2, #16\n\t"
++#endif
++ "b 8f\n"
++
++ "6:\n\t"
++ SHFT2MSB" w2, w2, #16\n\t"
++ SHFT2LSB" t1, w1, #16\n\t"
++ "4:\n\t"
++ SHFT2LSB" w2, w2, #16\n\t"
++ "b 8f\n\t"
++
++ /* Critical inner Loop: Block with 1 byte initial overlap */
++ ".p2align 2\n"
++ "3:\n\t"
++ "and t1, w1, #"LSB"\n\t"
++ "cmp t1, w2, "SHFT2LSB" #24\n\t"
++ "sub r3, w1, b1\n\t"
++ "bic r3, r3, w1\n\t"
++ "bne 4f\n\t"
++ "ands r3, r3, b1, lsl #7\n\t"
++ "it eq\n\t"
++ "ldreq w2, [wp2], #4\n\t"
++ "bne 5f\n\t"
++ "eor t1, t1, w1\n\t"
++ "cmp t1, w2, "SHFT2MSB" #8\n\t"
++ "bne 6f\n\t"
++ "ldr w1, [wp1], #4\n\t"
++ "b 3b\n"
++ "4:\n\t"
++ SHFT2LSB" w2, w2, #24\n\t"
++ "b 8f\n"
++ "5:\n\t"
++ /* The syndrome value may contain false ones if the string ends
++ with the bytes 0x01 0x00 */
++ "tst w1, #"LSB"\n\t"
++ "beq 7f\n\t"
++ "ldr w2, [wp2], #4\n"
++ "6:\n\t"
++ SHFT2LSB" t1, w1, #8\n\t"
++ "bic w2, w2, #"MSB"\n\t"
++ "b 8f\n"
++ "7:\n\t"
++ "mov r0, #0\n\t"
++ // "ldmfd sp!, {r4, r5}\n\t"
++ "ldr r4, [sp], #4\n\t"
++ "ldr r5, [sp], #4\n\t"
++ "BX LR\n"
++ "8:\n\t"
++ "and r2, t1, #"LSB"\n\t"
++ "and r0, w2, #"LSB"\n\t"
++ "cmp r0, #1\n\t"
++ "it cs\n\t"
++ "cmpcs r0, r2\n\t"
++ "itt eq\n\t"
++ SHFT2LSB"eq t1, t1, #8\n\t"
++ SHFT2LSB"eq w2, w2, #8\n\t"
++ "beq 8b\n\t"
++ "sub r0, r2, r0\n\t"
++ // "ldmfd sp!, {r4, r5}\n\t"
++ "ldr r4, [sp], #4\n\t"
++ "ldr r5, [sp], #4\n\t"
++ "BX LR");
++}
++
++#endif
++
++libc_hidden_builtin_def (strcmp)
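Both the loop at label 4 and the #if 0 pseudo-code above hinge on the "magic" constants 0x01010101 and 0x80808080: (w - 0x01010101) & ~w & 0x80808080 is non-zero exactly when some byte of w is zero, so one test per word finds the terminator while an ordinary compare finds a difference. A hedged portable-C rendering of the co-aligned fast path (assuming, as the assembly checks up front, that s1 and s2 share the same offset from a word boundary; like the assembly it reads whole aligned words, which may include bytes past the terminator):

#include <stdint.h>
#include <string.h>

static int strcmp_sketch(const char *s1, const char *s2)
{
    /* Byte-wise until s1 (and therefore s2, being co-aligned) is word aligned. */
    while ((uintptr_t)s1 & 3) {
        if (*s1 == '\0' || *s1 != *s2)
            return (unsigned char)*s1 - (unsigned char)*s2;
        s1++, s2++;
    }
    for (;;) {
        uint32_t w1, w2;
        memcpy(&w1, s1, sizeof w1);
        memcpy(&w2, s2, sizeof w2);
        /* Stop at the first word that differs or contains a zero byte. */
        if (w1 != w2 || ((w1 - 0x01010101u) & ~w1 & 0x80808080u))
            break;
        s1 += 4, s2 += 4;
    }
    /* Resolve the exact byte the simple way. */
    while (*s1 != '\0' && *s1 == *s2)
        s1++, s2++;
    return (unsigned char)*s1 - (unsigned char)*s2;
}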
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c
+new file mode 100644
+index 0000000..aa8cb06
+--- /dev/null
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c
+@@ -0,0 +1,172 @@
++/*
++ * Copyright (c) 2008 ARM Ltd
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. The name of the company may not be used to endorse or promote
++ * products derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
++ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <string.h>
++#include <memcopy.h>
++
++#undef strcmp
++
++
++#ifdef __thumb2__
++#define magic1(REG) "#0x01010101"
++#define magic2(REG) "#0x80808080"
++#else
++#define magic1(REG) #REG
++#define magic2(REG) #REG ", lsl #7"
++#endif
++
++char* __attribute__((naked))
++strcpy (char* dst, const char* src)
++{
++ asm (
++#if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
++ (defined (__thumb__) && !defined (__thumb2__)))
++ "pld [r1, #0]\n\t"
++ "eor r2, r0, r1\n\t"
++ "mov ip, r0\n\t"
++ "tst r2, #3\n\t"
++ "bne 4f\n\t"
++ "tst r1, #3\n\t"
++ "bne 3f\n"
++ "5:\n\t"
++#ifndef __thumb2__
++ "str r5, [sp, #-4]!\n\t"
++ "mov r5, #0x01\n\t"
++ "orr r5, r5, r5, lsl #8\n\t"
++ "orr r5, r5, r5, lsl #16\n\t"
++#endif
++
++ "str r4, [sp, #-4]!\n\t"
++ "tst r1, #4\n\t"
++ "ldr r3, [r1], #4\n\t"
++ "beq 2f\n\t"
++ "sub r2, r3, "magic1(r5)"\n\t"
++ "bics r2, r2, r3\n\t"
++ "tst r2, "magic2(r5)"\n\t"
++ "itt eq\n\t"
++ "streq r3, [ip], #4\n\t"
++ "ldreq r3, [r1], #4\n"
++ "bne 1f\n\t"
++ /* Inner loop. We now know that r1 is 64-bit aligned, so we
++ can safely fetch up to two words. This allows us to avoid
++ load stalls. */
++ ".p2align 2\n"
++ "2:\n\t"
++ "pld [r1, #8]\n\t"
++ "ldr r4, [r1], #4\n\t"
++ "sub r2, r3, "magic1(r5)"\n\t"
++ "bics r2, r2, r3\n\t"
++ "tst r2, "magic2(r5)"\n\t"
++ "sub r2, r4, "magic1(r5)"\n\t"
++ "bne 1f\n\t"
++ "str r3, [ip], #4\n\t"
++ "bics r2, r2, r4\n\t"
++ "tst r2, "magic2(r5)"\n\t"
++ "itt eq\n\t"
++ "ldreq r3, [r1], #4\n\t"
++ "streq r4, [ip], #4\n\t"
++ "beq 2b\n\t"
++ "mov r3, r4\n"
++ "1:\n\t"
++#ifdef __ARMEB__
++ "rors r3, r3, #24\n\t"
++#endif
++ "strb r3, [ip], #1\n\t"
++ "tst r3, #0xff\n\t"
++#ifdef __ARMEL__
++ "ror r3, r3, #8\n\t"
++#endif
++ "bne 1b\n\t"
++ "ldr r4, [sp], #4\n\t"
++#ifndef __thumb2__
++ "ldr r5, [sp], #4\n\t"
++#endif
++ "BX LR\n"
++
++ /* Strings have the same offset from word alignment, but it's
++ not zero. */
++ "3:\n\t"
++ "tst r1, #1\n\t"
++ "beq 1f\n\t"
++ "ldrb r2, [r1], #1\n\t"
++ "strb r2, [ip], #1\n\t"
++ "cmp r2, #0\n\t"
++ "it eq\n"
++ "BXEQ LR\n"
++ "1:\n\t"
++ "tst r1, #2\n\t"
++ "beq 5b\n\t"
++ "ldrh r2, [r1], #2\n\t"
++#ifdef __ARMEB__
++ "tst r2, #0xff00\n\t"
++ "iteet ne\n\t"
++ "strneh r2, [ip], #2\n\t"
++ "lsreq r2, r2, #8\n\t"
++ "streqb r2, [ip]\n\t"
++ "tstne r2, #0xff\n\t"
++#else
++ "tst r2, #0xff\n\t"
++ "itet ne\n\t"
++ "strneh r2, [ip], #2\n\t"
++ "streqb r2, [ip]\n\t"
++ "tstne r2, #0xff00\n\t"
++#endif
++ "bne 5b\n\t"
++ "BX LR\n"
++
++ /* src and dst do not have a common word alignment. Fall back to
++ byte copying. */
++ "4:\n\t"
++ "ldrb r2, [r1], #1\n\t"
++ "strb r2, [ip], #1\n\t"
++ "cmp r2, #0\n\t"
++ "bne 4b\n\t"
++ "BX LR"
++
++#elif !defined (__thumb__) || defined (__thumb2__)
++ "mov r3, r0\n\t"
++ "1:\n\t"
++ "ldrb r2, [r1], #1\n\t"
++ "strb r2, [r3], #1\n\t"
++ "cmp r2, #0\n\t"
++ "bne 1b\n\t"
++ "BX LR"
++#else
++ "mov r3, r0\n\t"
++ "1:\n\t"
++ "ldrb r2, [r1]\n\t"
++ "add r1, r1, #1\n\t"
++ "strb r2, [r3]\n\t"
++ "add r3, r3, #1\n\t"
++ "cmp r2, #0\n\t"
++ "bne 1b\n\t"
++ "BX LR"
++#endif
++ );
++}
++libc_hidden_builtin_def (strcpy)
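strcpy applies the same zero-byte test as strcmp.c (the magic1/magic2 constants) to copy whole words until the word holding the terminator is reached, then finishes that word byte by byte (strb plus a rotate in the assembly). Sketched in portable C for the word-aligned case the real code sets up first (illustrative only):

#include <stdint.h>
#include <string.h>

static char *strcpy_sketch(char *dst, const char *src)
{
    char *d = dst;
    /* Word-at-a-time while both pointers are aligned and the loaded word
       contains no NUL byte (same test as in the strcmp sketch above). */
    while ((((uintptr_t)src | (uintptr_t)d) & 3) == 0) {
        uint32_t w;
        memcpy(&w, src, sizeof w);
        if ((w - 0x01010101u) & ~w & 0x80808080u)
            break;                       /* terminator is inside this word */
        memcpy(d, &w, sizeof w);
        src += 4, d += 4;
    }
    while ((*d++ = *src++) != '\0')      /* copy the final bytes and the NUL */
        ;
    return dst;
}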
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S
+new file mode 100644
+index 0000000..125e92f
+--- /dev/null
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S
+@@ -0,0 +1,111 @@
++/* Copyright (c) 2010-2011, Linaro Limited
++ All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++
++ * Neither the name of Linaro Limited nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++ Written by Dave Gilbert <david.gilbert@linaro.org>
++
++ This strlen routine is optimised on a Cortex-A9 and should work on
++ all ARMv7 processors. This routine is reasonably fast for short
++ strings, but is probably slower than a simple implementation if all
++ your strings are very short */
++
++@ 2011-02-08 david.gilbert@linaro.org
++@ Extracted from local git 6848613a
++
++
++@ this lets us check a flag in a 00/ff byte easily in either endianness
++#ifdef __ARMEB__
++#define CHARTSTMASK(c) 1<<(31-(c*8))
++#else
++#define CHARTSTMASK(c) 1<<(c*8)
++#endif
++
++@-----------------------------------------------------------------------------------------------------------------------------
++ .syntax unified
++ .arch armv7-a
++
++ .thumb_func
++ .align 2
++ .p2align 4,,15
++ .global strlen
++ .type strlen,%function
++strlen:
++ @ r0 = string
++ @ returns count of bytes in string not including terminator
++ mov r1, r0
++ push { r4,r6 }
++ mvns r6, #0 @ all F
++ movs r4, #0
++ tst r0, #7
++ beq 2f
++
++1:
++ ldrb r2, [r1], #1
++ tst r1, #7 @ Hit alignment yet?
++ cbz r2, 10f @ Exit if we found the 0
++ bne 1b
++
++ @ So we're now aligned
++2:
++ ldmia r1!,{r2,r3}
++ uadd8 r2, r2, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
++ sel r2, r4, r6 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
++ uadd8 r3, r3, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
++ sel r3, r2, r6 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
++ cmp r3, #0
++ beq 2b
++
++strlenendtmp:
++ @ One (or more) of the bytes we loaded was 0 - but which one?
++ @ r2 has the mask corresponding to the first loaded word
++ @ r3 has a combined mask of the two words - but if r2 was all non-0
++ @ then it's just the 2nd word's
++ cmp r2, #0
++ itte eq
++ moveq r2, r3 @ the end is in the 2nd word
++ subeq r1,r1,#3
++ subne r1,r1,#7
++
++ @ r1 currently points to the 2nd byte of the word containing the 0
++ tst r2, # CHARTSTMASK(0) @ 1st character
++ bne 10f
++ adds r1,r1,#1
++ tst r2, # CHARTSTMASK(1) @ 2nd character
++ ittt eq
++ addeq r1,r1,#1
++ tsteq r2, # (3<<15) @ 2nd & 3rd character
++ @ If not the 3rd must be the last one
++ addeq r1,r1,#1
++
++10:
++ @ r0 is still at the beginning, r1 is pointing 1 byte after the terminator
++ sub r0, r1, r0
++ subs r0, r0, #1
++ pop { r4, r6 }
++ bx lr
+--
+1.7.7
+
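strlen.S scans two words per iteration with the same uadd8/sel 00/FF syndrome as memchr.S; its tail (the CHARTSTMASK tests) then works out which byte of the flagged word was the NUL so the final subtraction yields the length. On a little-endian machine that last step amounts to finding the lowest 0xFF lane of the syndrome, for example (an illustrative helper, not code from the patch):

#include <stdint.h>

/* syndrome has 0xFF in every byte lane that held a zero byte (the sel
   result above); the offset of the first such lane is the index of the
   terminator within the word on a little-endian machine. */
static unsigned first_zero_byte_index(uint32_t syndrome)
{
    unsigned i = 0;
    while ((syndrome & 0xffu) == 0) {
        syndrome >>= 8;
        i++;
    }
    return i;
}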
diff --git a/patches/glibc-2.13/0025-add-libc_hidden_builtin_def-for-all-cortex-functions.patch b/patches/glibc-2.13/0025-add-libc_hidden_builtin_def-for-all-cortex-functions.patch
new file mode 100644
index 0000000..c5e0d23
--- /dev/null
+++ b/patches/glibc-2.13/0025-add-libc_hidden_builtin_def-for-all-cortex-functions.patch
@@ -0,0 +1,79 @@
+From: Michael Olbrich <m.olbrich@pengutronix.de>
+Date: Thu, 15 Sep 2011 23:30:25 +0200
+Subject: [PATCH] add libc_hidden_builtin_def for all cortex functions
+
+Signed-off-by: Michael Olbrich <m.olbrich@pengutronix.de>
+---
+ .../sysdeps/arm/eabi/arm/cortex-a8/memchr.S | 3 +++
+ .../sysdeps/arm/eabi/arm/cortex-a8/memcpy.S | 2 ++
+ .../sysdeps/arm/eabi/arm/cortex-a8/memset.S | 2 ++
+ .../sysdeps/arm/eabi/arm/cortex-a8/strchr.S | 3 +++
+ .../sysdeps/arm/eabi/arm/cortex-a8/strcpy.c | 1 +
+ .../sysdeps/arm/eabi/arm/cortex-a8/strlen.S | 2 ++
+ 6 files changed, 13 insertions(+), 0 deletions(-)
+
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S
+index 8f5aaa9..6d497cb 100644
+--- a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memchr.S
+@@ -148,3 +148,6 @@ memchr:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
++
++strong_alias (memchr, __memchr)
++libc_hidden_builtin_def (memchr)
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S
+index 3be24ca..c274207 100644
+--- a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memcpy.S
+@@ -150,3 +150,5 @@ memcpy:
+ cmp r2,#32
+ blt 10b
+ b 4b
++
++libc_hidden_builtin_def (memcpy)
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S
+index 921cb75..d4c12a4 100644
+--- a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/memset.S
+@@ -116,3 +116,5 @@ memset:
+
+ 10:
+ bx lr @ goodbye
++
++libc_hidden_builtin_def (memset)
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S
+index 8875dbf..05c832f 100644
+--- a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strchr.S
+@@ -74,3 +74,6 @@ strchr:
+ beq 5b @ A bit messy, if it's common we should branch at the start to a special loop
+ mov r0,#0
+ bx lr
++
++weak_alias (strchr, index)
++libc_hidden_builtin_def (strchr)
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c
+index aa8cb06..3bbaa86 100644
+--- a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strcpy.c
+@@ -169,4 +169,5 @@ strcpy (char* dst, const char* src)
+ #endif
+ );
+ }
++
+ libc_hidden_builtin_def (strcpy)
+diff --git a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S
+index 125e92f..a1e02ad 100644
+--- a/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S
++++ b/cortex-strings/sysdeps/arm/eabi/arm/cortex-a8/strlen.S
+@@ -109,3 +109,5 @@ strlenendtmp:
+ subs r0, r0, #1
+ pop { r4, r6 }
+ bx lr
++
++libc_hidden_builtin_def (strlen)
+--
+1.7.7
+
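The second patch is needed because glibc calls many of these functions internally through hidden aliases; an architecture override that does not declare them fails to link inside libc.so. Roughly what the macros provide (simplified from glibc's include/libc-symbols.h; the exact libc_hidden_builtin_def expansion depends on the build configuration):

/* strong_alias / weak_alias add a second symbol name for the same code: */
#define strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
#define weak_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));

/* libc_hidden_builtin_def (name) additionally provides the hidden __GI_name
   alias that internal callers inside libc.so are redirected to, so those
   calls bind locally instead of going through the PLT.  Without these lines
   the glibc build ends with undefined references to the __GI_* symbols for
   the overridden functions. */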
diff --git a/patches/glibc-2.13/series b/patches/glibc-2.13/series
index edb574b..313e852 100644
--- a/patches/glibc-2.13/series
+++ b/patches/glibc-2.13/series
@@ -21,3 +21,5 @@
0021-pre20040117-pt_pax.patch
0022-fpscr_values.patch
0023-Fix-prelinking.patch
+0024-optimized-string-functions-for-NEON-from-Linaro.patch
+0025-add-libc_hidden_builtin_def-for-all-cortex-functions.patch