summaryrefslogtreecommitdiffstats
path: root/libc/bionic/memmove_words.c
blob: 26df7fafa0e55b394908e0ef247ee0fd54c72f96 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdlib.h>
#include <stdint.h>
#include <assert.h>

/*
 * Works like memmove(), except:
 * - if all arguments are at least 32-bit aligned, we guarantee that we
 *   will use operations that preserve atomicity of 32-bit values
 * - if not, we guarantee atomicity of 16-bit values
 *
 * If all three arguments are not at least 16-bit aligned, the behavior
 * of this function is undefined.  (We could remove this restriction by
 * testing for unaligned values and punting to memmove(), but that's
 * not currently useful.)
 *
 * TODO: add loop for 64-bit alignment
 * TODO: use __builtin_prefetch
 * TODO: write an ARM-optimized version
 */
void _memmove_words(void* dest, const void* src, size_t n)
{
    assert((((uintptr_t) dest | (uintptr_t) src | n) & 0x01) == 0);

    char* d = (char*) dest;
    const char* s = (const char*) src;
    size_t copyCount;

    /*
     * If the source and destination pointers are the same, this is
     * an expensive no-op.  Testing for an empty move now allows us
     * to skip a check later.
     */
    if (n == 0 || d == s)
        return;

    /*
     * Determine if the source and destination buffers will overlap if
     * we copy data forward (i.e. *dest++ = *src++).
     *
     * It's okay if the destination buffer starts before the source and
     * there is some overlap, because the reader is always ahead of the
     * writer.
     */
    if (__builtin_expect((d < s) || ((size_t)(d - s) >= n), 1)) {
        /*
         * Copy forward.  We prefer 32-bit loads and stores even for 16-bit
         * data, so sort that out.
         */
        if ((((uintptr_t) d | (uintptr_t) s) & 0x03) != 0) {
            /*
             * Not 32-bit aligned.  Two possibilities:
             * (1) Congruent, we can align to 32-bit by copying one 16-bit val
             * (2) Non-congruent, we can do one of:
             *   a. copy whole buffer as a series of 16-bit values
             *   b. load/store 32 bits, using shifts to ensure alignment
             *   c. just copy the as 32-bit values and assume the CPU
             *      will do a reasonable job
             *
             * We're currently using (a), which is suboptimal.
             */
            if ((((uintptr_t) d ^ (uintptr_t) s) & 0x03) != 0) {
                copyCount = n;
            } else {
                copyCount = 2;
            }
            n -= copyCount;
            copyCount /= sizeof(uint16_t);

            while (copyCount--) {
                *(uint16_t*)d = *(uint16_t*)s;
                d += sizeof(uint16_t);
                s += sizeof(uint16_t);
            }
        }

        /*
         * Copy 32-bit aligned words.
         */
        copyCount = n / sizeof(uint32_t);
        while (copyCount--) {
            *(uint32_t*)d = *(uint32_t*)s;
            d += sizeof(uint32_t);
            s += sizeof(uint32_t);
        }

        /*
         * Check for leftovers.  Either we finished exactly, or we have
         * one remaining 16-bit chunk.
         */
        if ((n & 0x02) != 0) {
            *(uint16_t*)d = *(uint16_t*)s;
        }
    } else {
        /*
         * Copy backward, starting at the end.
         */
        d += n;
        s += n;

        if ((((uintptr_t) d | (uintptr_t) s) & 0x03) != 0) {
            /* try for 32-bit alignment */
            if ((((uintptr_t) d ^ (uintptr_t) s) & 0x03) != 0) {
                copyCount = n;
            } else {
                copyCount = 2;
            }
            n -= copyCount;
            copyCount /= sizeof(uint16_t);

            while (copyCount--) {
                d -= sizeof(uint16_t);
                s -= sizeof(uint16_t);
                *(uint16_t*)d = *(uint16_t*)s;
            }
        }

        /* copy 32-bit aligned words */
        copyCount = n / sizeof(uint32_t);
        while (copyCount--) {
            d -= sizeof(uint32_t);
            s -= sizeof(uint32_t);
            *(uint32_t*)d = *(uint32_t*)s;
        }

        /* copy leftovers */
        if ((n & 0x02) != 0) {
            d -= sizeof(uint16_t);
            s -= sizeof(uint16_t);
            *(uint16_t*)d = *(uint16_t*)s;
        }
    }
}