summaryrefslogtreecommitdiffstats
path: root/libc/bionic/pthread_mutex.cpp
blob: 851fc3d99c7535ecc2a9dd4aeac83dbd2eeecbc7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <pthread.h>

#include <errno.h>
#include <limits.h>
#include <stdatomic.h>
#include <string.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <unistd.h>

#include "pthread_internal.h"

#include "private/bionic_constants.h"
#include "private/bionic_futex.h"
#include "private/bionic_systrace.h"
#include "private/bionic_time_conversions.h"
#include "private/bionic_tls.h"

/* a mutex attribute holds the following fields
 *
 * bits:     name       description
 * 0-3       type       type of mutex
 * 4         shared     process-shared flag
 */
#define  MUTEXATTR_TYPE_MASK   0x000f
#define  MUTEXATTR_SHARED_MASK 0x0010

int pthread_mutexattr_init(pthread_mutexattr_t *attr)
{
    *attr = PTHREAD_MUTEX_DEFAULT;
    return 0;
}

int pthread_mutexattr_destroy(pthread_mutexattr_t *attr)
{
    *attr = -1;
    return 0;
}

int pthread_mutexattr_gettype(const pthread_mutexattr_t *attr, int *type_p)
{
    int type = (*attr & MUTEXATTR_TYPE_MASK);

    if (type < PTHREAD_MUTEX_NORMAL || type > PTHREAD_MUTEX_ERRORCHECK) {
        return EINVAL;
    }

    *type_p = type;
    return 0;
}

int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type)
{
    if (type < PTHREAD_MUTEX_NORMAL || type > PTHREAD_MUTEX_ERRORCHECK ) {
        return EINVAL;
    }

    *attr = (*attr & ~MUTEXATTR_TYPE_MASK) | type;
    return 0;
}

/* process-shared mutexes are not supported at the moment */

int pthread_mutexattr_setpshared(pthread_mutexattr_t *attr, int  pshared)
{
    switch (pshared) {
    case PTHREAD_PROCESS_PRIVATE:
        *attr &= ~MUTEXATTR_SHARED_MASK;
        return 0;

    case PTHREAD_PROCESS_SHARED:
        /* our current implementation of pthread actually supports shared
         * mutexes but won't cleanup if a process dies with the mutex held.
         * Nevertheless, it's better than nothing. Shared mutexes are used
         * by surfaceflinger and audioflinger.
         */
        *attr |= MUTEXATTR_SHARED_MASK;
        return 0;
    }
    return EINVAL;
}

int pthread_mutexattr_getpshared(const pthread_mutexattr_t* attr, int* pshared) {
    *pshared = (*attr & MUTEXATTR_SHARED_MASK) ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
    return 0;
}

/* a mutex contains a state value and a owner_tid.
 * The value is implemented as a 16-bit integer holding the following fields:
 *
 * bits:     name     description
 * 15-14     type     mutex type
 * 13        shared   process-shared flag
 * 12-2      counter  counter of recursive mutexes
 * 1-0       state    lock state (0, 1 or 2)
 *
 * The owner_tid is used only in recursive and errorcheck mutex to hold the mutex owner thread tid.
 */

/* Convenience macro, creates a mask of 'bits' bits that starts from
 * the 'shift'-th least significant bit in a 32-bit word.
 *
 * Examples: FIELD_MASK(0,4)  -> 0xf
 *           FIELD_MASK(16,9) -> 0x1ff0000
 */
#define  FIELD_MASK(shift,bits)           (((1 << (bits))-1) << (shift))

/* This one is used to create a bit pattern from a given field value */
#define  FIELD_TO_BITS(val,shift,bits)    (((val) & ((1 << (bits))-1)) << (shift))

/* And this one does the opposite, i.e. extract a field's value from a bit pattern */
#define  FIELD_FROM_BITS(val,shift,bits)  (((val) >> (shift)) & ((1 << (bits))-1))


/* Convenience macros.
 *
 * These are used to form or modify the bit pattern of a given mutex value
 */

/* Mutex state:
 *
 * 0 for unlocked
 * 1 for locked, no waiters
 * 2 for locked, maybe waiters
 */
#define  MUTEX_STATE_SHIFT      0
#define  MUTEX_STATE_LEN        2

#define  MUTEX_STATE_MASK           FIELD_MASK(MUTEX_STATE_SHIFT, MUTEX_STATE_LEN)
#define  MUTEX_STATE_FROM_BITS(v)   FIELD_FROM_BITS(v, MUTEX_STATE_SHIFT, MUTEX_STATE_LEN)
#define  MUTEX_STATE_TO_BITS(v)     FIELD_TO_BITS(v, MUTEX_STATE_SHIFT, MUTEX_STATE_LEN)

#define  MUTEX_STATE_UNLOCKED            0   /* must be 0 to match PTHREAD_MUTEX_INITIALIZER */
#define  MUTEX_STATE_LOCKED_UNCONTENDED  1   /* must be 1 due to atomic dec in unlock operation */
#define  MUTEX_STATE_LOCKED_CONTENDED    2   /* must be 1 + LOCKED_UNCONTENDED due to atomic dec */

#define  MUTEX_STATE_BITS_UNLOCKED            MUTEX_STATE_TO_BITS(MUTEX_STATE_UNLOCKED)
#define  MUTEX_STATE_BITS_LOCKED_UNCONTENDED  MUTEX_STATE_TO_BITS(MUTEX_STATE_LOCKED_UNCONTENDED)
#define  MUTEX_STATE_BITS_LOCKED_CONTENDED    MUTEX_STATE_TO_BITS(MUTEX_STATE_LOCKED_CONTENDED)

/* return true iff the mutex if locked with no waiters */
#define  MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(v)  (((v) & MUTEX_STATE_MASK) == MUTEX_STATE_BITS_LOCKED_UNCONTENDED)

/* return true iff the mutex if locked with maybe waiters */
#define  MUTEX_STATE_BITS_IS_LOCKED_CONTENDED(v)   (((v) & MUTEX_STATE_MASK) == MUTEX_STATE_BITS_LOCKED_CONTENDED)

/* used to flip from LOCKED_UNCONTENDED to LOCKED_CONTENDED */
#define  MUTEX_STATE_BITS_FLIP_CONTENTION(v)      ((v) ^ (MUTEX_STATE_BITS_LOCKED_CONTENDED ^ MUTEX_STATE_BITS_LOCKED_UNCONTENDED))

/* Mutex counter:
 *
 * We need to check for overflow before incrementing, and we also need to
 * detect when the counter is 0
 */
#define  MUTEX_COUNTER_SHIFT         2
#define  MUTEX_COUNTER_LEN           11
#define  MUTEX_COUNTER_MASK          FIELD_MASK(MUTEX_COUNTER_SHIFT, MUTEX_COUNTER_LEN)

#define  MUTEX_COUNTER_BITS_WILL_OVERFLOW(v)    (((v) & MUTEX_COUNTER_MASK) == MUTEX_COUNTER_MASK)
#define  MUTEX_COUNTER_BITS_IS_ZERO(v)          (((v) & MUTEX_COUNTER_MASK) == 0)

/* Used to increment the counter directly after overflow has been checked */
#define  MUTEX_COUNTER_BITS_ONE      FIELD_TO_BITS(1, MUTEX_COUNTER_SHIFT,MUTEX_COUNTER_LEN)

/* Mutex shared bit flag
 *
 * This flag is set to indicate that the mutex is shared among processes.
 * This changes the futex opcode we use for futex wait/wake operations
 * (non-shared operations are much faster).
 */
#define  MUTEX_SHARED_SHIFT    13
#define  MUTEX_SHARED_MASK     FIELD_MASK(MUTEX_SHARED_SHIFT,1)

/* Mutex type:
 * We support normal, recursive and errorcheck mutexes.
 */
#define  MUTEX_TYPE_SHIFT      14
#define  MUTEX_TYPE_LEN        2
#define  MUTEX_TYPE_MASK       FIELD_MASK(MUTEX_TYPE_SHIFT,MUTEX_TYPE_LEN)

#define  MUTEX_TYPE_TO_BITS(t)       FIELD_TO_BITS(t, MUTEX_TYPE_SHIFT, MUTEX_TYPE_LEN)

#define  MUTEX_TYPE_BITS_NORMAL      MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_NORMAL)
#define  MUTEX_TYPE_BITS_RECURSIVE   MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_RECURSIVE)
#define  MUTEX_TYPE_BITS_ERRORCHECK  MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_ERRORCHECK)

struct pthread_mutex_internal_t {
  _Atomic(uint16_t) state;
#if defined(__LP64__)
  uint16_t __pad;
  atomic_int owner_tid;
  char __reserved[32];
#else
  _Atomic(uint16_t) owner_tid;
#endif
} __attribute__((aligned(4)));

static_assert(sizeof(pthread_mutex_t) == sizeof(pthread_mutex_internal_t),
              "pthread_mutex_t should actually be pthread_mutex_internal_t in implementation.");

// For binary compatibility with old version of pthread_mutex_t, we can't use more strict alignment
// than 4-byte alignment.
static_assert(alignof(pthread_mutex_t) == 4,
              "pthread_mutex_t should fulfill the alignment of pthread_mutex_internal_t.");

static inline pthread_mutex_internal_t* __get_internal_mutex(pthread_mutex_t* mutex_interface) {
  return reinterpret_cast<pthread_mutex_internal_t*>(mutex_interface);
}

int pthread_mutex_init(pthread_mutex_t* mutex_interface, const pthread_mutexattr_t* attr) {
    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);

    memset(mutex, 0, sizeof(pthread_mutex_internal_t));

    if (__predict_true(attr == NULL)) {
        atomic_init(&mutex->state, MUTEX_TYPE_BITS_NORMAL);
        return 0;
    }

    uint16_t state = 0;
    if ((*attr & MUTEXATTR_SHARED_MASK) != 0) {
        state |= MUTEX_SHARED_MASK;
    }

    switch (*attr & MUTEXATTR_TYPE_MASK) {
    case PTHREAD_MUTEX_NORMAL:
      state |= MUTEX_TYPE_BITS_NORMAL;
      break;
    case PTHREAD_MUTEX_RECURSIVE:
      state |= MUTEX_TYPE_BITS_RECURSIVE;
      break;
    case PTHREAD_MUTEX_ERRORCHECK:
      state |= MUTEX_TYPE_BITS_ERRORCHECK;
      break;
    default:
        return EINVAL;
    }

    atomic_init(&mutex->state, state);
    atomic_init(&mutex->owner_tid, 0);
    return 0;
}

static inline __always_inline int __pthread_normal_mutex_trylock(pthread_mutex_internal_t* mutex,
                                                                 uint16_t shared) {
    const uint16_t unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
    const uint16_t locked_uncontended = shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;

    uint16_t old_state = unlocked;
    if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
                         locked_uncontended, memory_order_acquire, memory_order_relaxed))) {
        return 0;
    }
    return EBUSY;
}

/*
 * Lock a mutex of type NORMAL.
 *
 * As noted above, there are three states:
 *   0 (unlocked, no contention)
 *   1 (locked, no contention)
 *   2 (locked, contention)
 *
 * Non-recursive mutexes don't use the thread-id or counter fields, and the
 * "type" value is zero, so the only bits that will be set are the ones in
 * the lock state field.
 */
static inline __always_inline int __pthread_normal_mutex_lock(pthread_mutex_internal_t* mutex,
                                                              uint16_t shared,
                                                              const timespec* abs_timeout_or_null,
                                                              clockid_t clock) {
    if (__predict_true(__pthread_normal_mutex_trylock(mutex, shared) == 0)) {
        return 0;
    }

    ScopedTrace trace("Contending for pthread mutex");

    const uint16_t unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
    const uint16_t locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;

    // We want to go to sleep until the mutex is available, which requires
    // promoting it to locked_contended. We need to swap in the new state
    // and then wait until somebody wakes us up.
    // An atomic_exchange is used to compete with other threads for the lock.
    // If it returns unlocked, we have acquired the lock, otherwise another
    // thread still holds the lock and we should wait again.
    // If lock is acquired, an acquire fence is needed to make all memory accesses
    // made by other threads visible to the current CPU.
    while (atomic_exchange_explicit(&mutex->state, locked_contended,
                                    memory_order_acquire) != unlocked) {
        timespec ts;
        timespec* rel_timeout = NULL;
        if (abs_timeout_or_null != NULL) {
            rel_timeout = &ts;
            if (!timespec_from_absolute_timespec(*rel_timeout, *abs_timeout_or_null, clock)) {
                return ETIMEDOUT;
            }
        }
        if (__futex_wait_ex(&mutex->state, shared, locked_contended, rel_timeout) == -ETIMEDOUT) {
            return ETIMEDOUT;
        }
    }
    return 0;
}

/*
 * Release a normal mutex.  The caller is responsible for determining
 * that we are in fact the owner of this lock.
 */
static inline __always_inline void __pthread_normal_mutex_unlock(pthread_mutex_internal_t* mutex,
                                                                 uint16_t shared) {
    const uint16_t unlocked         = shared | MUTEX_STATE_BITS_UNLOCKED;
    const uint16_t locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;

    // We use an atomic_exchange to release the lock. If locked_contended state
    // is returned, some threads is waiting for the lock and we need to wake up
    // one of them.
    // A release fence is required to make previous stores visible to next
    // lock owner threads.
    if (atomic_exchange_explicit(&mutex->state, unlocked,
                                 memory_order_release) == locked_contended) {
        // Wake up one waiting thread. We don't know which thread will be
        // woken or when it'll start executing -- futexes make no guarantees
        // here. There may not even be a thread waiting.
        //
        // The newly-woken thread will replace the unlocked state we just set above
        // with locked_contended state, which means that when it eventually releases
        // the mutex it will also call FUTEX_WAKE. This results in one extra wake
        // call whenever a lock is contended, but let us avoid forgetting anyone
        // without requiring us to track the number of sleepers.
        //
        // It's possible for another thread to sneak in and grab the lock between
        // the exchange above and the wake call below. If the new thread is "slow"
        // and holds the lock for a while, we'll wake up a sleeper, which will swap
        // in locked_uncontended state and then go back to sleep since the lock is
        // still held. If the new thread is "fast", running to completion before
        // we call wake, the thread we eventually wake will find an unlocked mutex
        // and will execute. Either way we have correct behavior and nobody is
        // orphaned on the wait queue.
        __futex_wake_ex(&mutex->state, shared, 1);
    }
}

/* This common inlined function is used to increment the counter of a recursive mutex.
 *
 * If the counter overflows, it will return EAGAIN.
 * Otherwise, it atomically increments the counter and returns 0.
 *
 */
static inline __always_inline int __recursive_increment(pthread_mutex_internal_t* mutex,
                                                        uint16_t old_state) {
    // Detect recursive lock overflow and return EAGAIN.
    // This is safe because only the owner thread can modify the
    // counter bits in the mutex value.
    if (MUTEX_COUNTER_BITS_WILL_OVERFLOW(old_state)) {
        return EAGAIN;
    }

    // Other threads are able to change the lower bits (e.g. promoting it to "contended"),
    // but the mutex counter will not overflow. So we use atomic_fetch_add operation here.
    // The mutex is still locked by current thread, so we don't need a release fence.
    atomic_fetch_add_explicit(&mutex->state, MUTEX_COUNTER_BITS_ONE, memory_order_relaxed);
    return 0;
}

static inline __always_inline int __recursive_or_errorcheck_mutex_wait(
                                                      pthread_mutex_internal_t* mutex,
                                                      uint16_t shared,
                                                      uint16_t old_state,
                                                      const timespec* rel_timeout) {
// __futex_wait always waits on a 32-bit value. But state is 16-bit. For a normal mutex, the owner_tid
// field in mutex is not used. On 64-bit devices, the __pad field in mutex is not used.
// But when a recursive or errorcheck mutex is used on 32-bit devices, we need to add the
// owner_tid value in the value argument for __futex_wait, otherwise we may always get EAGAIN error.

#if defined(__LP64__)
  return __futex_wait_ex(&mutex->state, shared, old_state, rel_timeout);

#else
  // This implementation works only when the layout of pthread_mutex_internal_t matches below expectation.
  // And it is based on the assumption that Android is always in little-endian devices.
  static_assert(offsetof(pthread_mutex_internal_t, state) == 0, "");
  static_assert(offsetof(pthread_mutex_internal_t, owner_tid) == 2, "");

  uint32_t owner_tid = atomic_load_explicit(&mutex->owner_tid, memory_order_relaxed);
  return __futex_wait_ex(&mutex->state, shared, (owner_tid << 16) | old_state, rel_timeout);
#endif
}

static int __pthread_mutex_lock_with_timeout(pthread_mutex_internal_t* mutex,
                                           const timespec* abs_timeout_or_null, clockid_t clock) {
    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
    uint16_t mtype = (old_state & MUTEX_TYPE_MASK);
    uint16_t shared = (old_state & MUTEX_SHARED_MASK);

    // Handle common case first.
    if ( __predict_true(mtype == MUTEX_TYPE_BITS_NORMAL) ) {
        return __pthread_normal_mutex_lock(mutex, shared, abs_timeout_or_null, clock);
    }

    // Do we already own this recursive or error-check mutex?
    pid_t tid = __get_thread()->tid;
    if (tid == atomic_load_explicit(&mutex->owner_tid, memory_order_relaxed)) {
        if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
            return EDEADLK;
        }
        return __recursive_increment(mutex, old_state);
    }

    const uint16_t unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
    const uint16_t locked_uncontended = mtype | shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
    const uint16_t locked_contended   = mtype | shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;

    // First, if the mutex is unlocked, try to quickly acquire it.
    // In the optimistic case where this works, set the state to locked_uncontended.
    if (old_state == unlocked) {
        // If exchanged successfully, an acquire fence is required to make
        // all memory accesses made by other threads visible to the current CPU.
        if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
                             locked_uncontended, memory_order_acquire, memory_order_relaxed))) {
            atomic_store_explicit(&mutex->owner_tid, tid, memory_order_relaxed);
            return 0;
        }
    }

    ScopedTrace trace("Contending for pthread mutex");

    while (true) {
        if (old_state == unlocked) {
            // NOTE: We put the state to locked_contended since we _know_ there
            // is contention when we are in this loop. This ensures all waiters
            // will be unlocked.

            // If exchanged successfully, an acquire fence is required to make
            // all memory accesses made by other threads visible to the current CPU.
            if (__predict_true(atomic_compare_exchange_weak_explicit(&mutex->state,
                                                                     &old_state, locked_contended,
                                                                     memory_order_acquire,
                                                                     memory_order_relaxed))) {
                atomic_store_explicit(&mutex->owner_tid, tid, memory_order_relaxed);
                return 0;
            }
            continue;
        } else if (MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(old_state)) {
            // We should set it to locked_contended beforing going to sleep. This can make
            // sure waiters will be woken up eventually.

            int new_state = MUTEX_STATE_BITS_FLIP_CONTENTION(old_state);
            if (__predict_false(!atomic_compare_exchange_weak_explicit(&mutex->state,
                                                                       &old_state, new_state,
                                                                       memory_order_relaxed,
                                                                       memory_order_relaxed))) {
                continue;
            }
            old_state = new_state;
        }

        // We are in locked_contended state, sleep until someone wakes us up.
        timespec ts;
        timespec* rel_timeout = NULL;
        if (abs_timeout_or_null != NULL) {
            rel_timeout = &ts;
            if (!timespec_from_absolute_timespec(*rel_timeout, *abs_timeout_or_null, clock)) {
                return ETIMEDOUT;
            }
        }
        if (__recursive_or_errorcheck_mutex_wait(mutex, shared, old_state, rel_timeout) == -ETIMEDOUT) {
            return ETIMEDOUT;
        }
        old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
    }
}

int pthread_mutex_lock(pthread_mutex_t* mutex_interface) {
#if !defined(__LP64__)
    if (mutex_interface == NULL) {
        return EINVAL;
    }
#endif

    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);

    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
    uint16_t mtype = (old_state & MUTEX_TYPE_MASK);
    uint16_t shared = (old_state & MUTEX_SHARED_MASK);
    // Avoid slowing down fast path of normal mutex lock operation.
    if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
      if (__predict_true(__pthread_normal_mutex_trylock(mutex, shared) == 0)) {
        return 0;
      }
    }
    return __pthread_mutex_lock_with_timeout(mutex, NULL, 0);
}

int pthread_mutex_unlock(pthread_mutex_t* mutex_interface) {
#if !defined(__LP64__)
    if (mutex_interface == NULL) {
        return EINVAL;
    }
#endif

    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);

    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
    uint16_t mtype  = (old_state & MUTEX_TYPE_MASK);
    uint16_t shared = (old_state & MUTEX_SHARED_MASK);

    // Handle common case first.
    if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
        __pthread_normal_mutex_unlock(mutex, shared);
        return 0;
    }

    // Do we already own this recursive or error-check mutex?
    pid_t tid = __get_thread()->tid;
    if ( tid != atomic_load_explicit(&mutex->owner_tid, memory_order_relaxed) ) {
        return EPERM;
    }

    // If the counter is > 0, we can simply decrement it atomically.
    // Since other threads can mutate the lower state bits (and only the
    // lower state bits), use a compare_exchange loop to do it.
    if (!MUTEX_COUNTER_BITS_IS_ZERO(old_state)) {
        // We still own the mutex, so a release fence is not needed.
        atomic_fetch_sub_explicit(&mutex->state, MUTEX_COUNTER_BITS_ONE, memory_order_relaxed);
        return 0;
    }

    // The counter is 0, so we'are going to unlock the mutex by resetting its
    // state to unlocked, we need to perform a atomic_exchange inorder to read
    // the current state, which will be locked_contended if there may have waiters
    // to awake.
    // A release fence is required to make previous stores visible to next
    // lock owner threads.
    atomic_store_explicit(&mutex->owner_tid, 0, memory_order_relaxed);
    const uint16_t unlocked = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
    old_state = atomic_exchange_explicit(&mutex->state, unlocked, memory_order_release);
    if (MUTEX_STATE_BITS_IS_LOCKED_CONTENDED(old_state)) {
        __futex_wake_ex(&mutex->state, shared, 1);
    }

    return 0;
}

int pthread_mutex_trylock(pthread_mutex_t* mutex_interface) {
    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);

    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
    uint16_t mtype  = (old_state & MUTEX_TYPE_MASK);
    uint16_t shared = (old_state & MUTEX_SHARED_MASK);

    const uint16_t unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
    const uint16_t locked_uncontended = mtype | shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;

    // Handle common case first.
    if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
        return __pthread_normal_mutex_trylock(mutex, shared);
    }

    // Do we already own this recursive or error-check mutex?
    pid_t tid = __get_thread()->tid;
    if (tid == atomic_load_explicit(&mutex->owner_tid, memory_order_relaxed)) {
        if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
            return EBUSY;
        }
        return __recursive_increment(mutex, old_state);
    }

    // Same as pthread_mutex_lock, except that we don't want to wait, and
    // the only operation that can succeed is a single compare_exchange to acquire the
    // lock if it is released / not owned by anyone. No need for a complex loop.
    // If exchanged successfully, an acquire fence is required to make
    // all memory accesses made by other threads visible to the current CPU.
    old_state = unlocked;
    if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
                                                               locked_uncontended,
                                                               memory_order_acquire,
                                                               memory_order_relaxed))) {
        atomic_store_explicit(&mutex->owner_tid, tid, memory_order_relaxed);
        return 0;
    }
    return EBUSY;
}

#if !defined(__LP64__)
extern "C" int pthread_mutex_lock_timeout_np(pthread_mutex_t* mutex_interface, unsigned ms) {
    timespec abs_timeout;
    clock_gettime(CLOCK_MONOTONIC, &abs_timeout);
    abs_timeout.tv_sec  += ms / 1000;
    abs_timeout.tv_nsec += (ms % 1000) * 1000000;
    if (abs_timeout.tv_nsec >= NS_PER_S) {
        abs_timeout.tv_sec++;
        abs_timeout.tv_nsec -= NS_PER_S;
    }

    int error = __pthread_mutex_lock_with_timeout(__get_internal_mutex(mutex_interface),
                                                  &abs_timeout, CLOCK_MONOTONIC);
    if (error == ETIMEDOUT) {
        error = EBUSY;
    }
    return error;
}
#endif

int pthread_mutex_timedlock(pthread_mutex_t* mutex_interface, const timespec* abs_timeout) {
    return __pthread_mutex_lock_with_timeout(__get_internal_mutex(mutex_interface),
                                             abs_timeout, CLOCK_REALTIME);
}

int pthread_mutex_destroy(pthread_mutex_t* mutex_interface) {
    // Use trylock to ensure that the mutex is valid and not already locked.
    int error = pthread_mutex_trylock(mutex_interface);
    if (error != 0) {
        return error;
    }
    return 0;
}