7 files changed, 1080 insertions, 159 deletions
diff --git a/Utilities/cmliblzma/common/mythread.h b/Utilities/cmliblzma/common/mythread.h
new file mode 100644
index 0000000..be22654
--- /dev/null
+++ b/Utilities/cmliblzma/common/mythread.h
@@ -0,0 +1,521 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       mythread.h
+/// \brief      Some threading related helper macros and functions
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef MYTHREAD_H
+#define MYTHREAD_H
+
+#include "sysdefs.h"
+
+// If any type of threading is enabled, #define MYTHREAD_ENABLED.
+#if defined(MYTHREAD_POSIX) || defined(MYTHREAD_WIN95) \
+		|| defined(MYTHREAD_VISTA)
+#	define MYTHREAD_ENABLED 1
+#endif
+
+
+#ifdef MYTHREAD_ENABLED
+
+////////////////////////////////////////
+// Shared between all threading types //
+////////////////////////////////////////
+
+// Locks a mutex for a duration of a block.
+//
+// Perform mythread_mutex_lock(&mutex) in the beginning of a block
+// and mythread_mutex_unlock(&mutex) at the end of the block. "break"
+// may be used to unlock the mutex and jump out of the block.
+// mythread_sync blocks may be nested.
+//
+// Example:
+//
+//     mythread_sync(mutex) {
+//         foo();
+//         if (some_error)
+//             break; // Skips bar()
+//         bar();
+//     }
+//
+// At least GCC optimizes the loops completely away so it doesn't slow
+// things down at all compared to plain mythread_mutex_lock(&mutex)
+// and mythread_mutex_unlock(&mutex) calls.
+//
+#define mythread_sync(mutex) mythread_sync_helper1(mutex, __LINE__)
+#define mythread_sync_helper1(mutex, line) mythread_sync_helper2(mutex, line)
+#define mythread_sync_helper2(mutex, line) \
+	for (unsigned int mythread_i_ ## line = 0; \
+			mythread_i_ ## line \
+				? (mythread_mutex_unlock(&(mutex)), 0) \
+				: (mythread_mutex_lock(&(mutex)), 1); \
+			mythread_i_ ## line = 1) \
+		for (unsigned int mythread_j_ ## line = 0; \
+				!mythread_j_ ## line; \
+				mythread_j_ ## line = 1)
+#endif
+
+
+#if !defined(MYTHREAD_ENABLED)
+
+//////////////////
+// No threading //
+//////////////////
+
+// Calls the given function once. This isn't thread safe.
+#define mythread_once(func) \
+do { \
+	static bool once_ = false; \
+	if (!once_) { \
+		func(); \
+		once_ = true; \
+	} \
+} while (0)
+
+
+#if !(defined(_WIN32) && !defined(__CYGWIN__))
+// Use sigprocmask() to set the signal mask in single-threaded programs.
+#include <signal.h>
+
+static inline void
+mythread_sigmask(int how, const sigset_t *restrict set,
+		sigset_t *restrict oset)
+{
+	int ret = sigprocmask(how, set, oset);
+	assert(ret == 0);
+	(void)ret;
+}
+#endif
+
+
+#elif defined(MYTHREAD_POSIX)
+
+////////////////////
+// Using pthreads //
+////////////////////
+
+#include <sys/time.h>
+#include <pthread.h>
+#include <signal.h>
+#include <time.h>
+#include <errno.h>
+
+#define MYTHREAD_RET_TYPE void *
+#define MYTHREAD_RET_VALUE NULL
+
+typedef pthread_t mythread;
+typedef pthread_mutex_t mythread_mutex;
+
+typedef struct {
+	pthread_cond_t cond;
+#ifdef HAVE_CLOCK_GETTIME
+	// Clock ID (CLOCK_REALTIME or CLOCK_MONOTONIC) associated with
+	// the condition variable.
+	clockid_t clk_id;
+#endif
+} mythread_cond;
+
+typedef struct timespec mythread_condtime;
+
+
+// Calls the given function once in a thread-safe way.
+#define mythread_once(func) \
+	do { \
+		static pthread_once_t once_ = PTHREAD_ONCE_INIT; \
+		pthread_once(&once_, &func); \
+	} while (0)
+
+
+// Use pthread_sigmask() to set the signal mask in multi-threaded programs.
+// Do nothing on OpenVMS since it lacks pthread_sigmask().
+static inline void
+mythread_sigmask(int how, const sigset_t *restrict set,
+		sigset_t *restrict oset)
+{
+#ifdef __VMS
+	(void)how;
+	(void)set;
+	(void)oset;
+#else
+	int ret = pthread_sigmask(how, set, oset);
+	assert(ret == 0);
+	(void)ret;
+#endif
+}
+
+
+// Creates a new thread with all signals blocked. Returns zero on success
+// and non-zero on error.
+static inline int
+mythread_create(mythread *thread, void *(*func)(void *arg), void *arg)
+{
+	sigset_t old;
+	sigset_t all;
+	sigfillset(&all);
+
+	mythread_sigmask(SIG_SETMASK, &all, &old);
+	const int ret = pthread_create(thread, NULL, func, arg);
+	mythread_sigmask(SIG_SETMASK, &old, NULL);
+
+	return ret;
+}
+
+// Joins a thread. Returns zero on success and non-zero on error.
+static inline int
+mythread_join(mythread thread)
+{
+	return pthread_join(thread, NULL);
+}
+
+
+// Initiatlizes a mutex. Returns zero on success and non-zero on error.
+static inline int
+mythread_mutex_init(mythread_mutex *mutex)
+{
+	return pthread_mutex_init(mutex, NULL);
+}
+
+static inline void
+mythread_mutex_destroy(mythread_mutex *mutex)
+{
+	int ret = pthread_mutex_destroy(mutex);
+	assert(ret == 0);
+	(void)ret;
+}
+
+static inline void
+mythread_mutex_lock(mythread_mutex *mutex)
+{
+	int ret = pthread_mutex_lock(mutex);
+	assert(ret == 0);
+	(void)ret;
+}
+
+static inline void
+mythread_mutex_unlock(mythread_mutex *mutex)
+{
+	int ret = pthread_mutex_unlock(mutex);
+	assert(ret == 0);
+	(void)ret;
+}
+
+
+// Initializes a condition variable.
+//
+// Using CLOCK_MONOTONIC instead of the default CLOCK_REALTIME makes the
+// timeout in pthread_cond_timedwait() work correctly also if system time
+// is suddenly changed. Unfortunately CLOCK_MONOTONIC isn't available
+// everywhere while the default CLOCK_REALTIME is, so the default is
+// used if CLOCK_MONOTONIC isn't available.
+//
+// If clock_gettime() isn't available at all, gettimeofday() will be used.
+static inline int
+mythread_cond_init(mythread_cond *mycond)
+{
+#ifdef HAVE_CLOCK_GETTIME
+	// NOTE: HAVE_DECL_CLOCK_MONOTONIC is always defined to 0 or 1.
+#	if defined(HAVE_PTHREAD_CONDATTR_SETCLOCK) && HAVE_DECL_CLOCK_MONOTONIC
+	struct timespec ts;
+	pthread_condattr_t condattr;
+
+	// POSIX doesn't seem to *require* that pthread_condattr_setclock()
+	// will fail if given an unsupported clock ID. Test that
+	// CLOCK_MONOTONIC really is supported using clock_gettime().
+	if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0
+			&& pthread_condattr_init(&condattr) == 0) {
+		int ret = pthread_condattr_setclock(
+				&condattr, CLOCK_MONOTONIC);
+		if (ret == 0)
+			ret = pthread_cond_init(&mycond->cond, &condattr);
+
+		pthread_condattr_destroy(&condattr);
+
+		if (ret == 0) {
+			mycond->clk_id = CLOCK_MONOTONIC;
+			return 0;
+		}
+	}
+
+	// If anything above fails, fall back to the default CLOCK_REALTIME.
+	// POSIX requires that all implementations of clock_gettime() must
+	// support at least CLOCK_REALTIME.
+#	endif
+
+	mycond->clk_id = CLOCK_REALTIME;
+#endif
+
+	return pthread_cond_init(&mycond->cond, NULL);
+}
+
+static inline void
+mythread_cond_destroy(mythread_cond *cond)
+{
+	int ret = pthread_cond_destroy(&cond->cond);
+	assert(ret == 0);
+	(void)ret;
+}
+
+static inline void
+mythread_cond_signal(mythread_cond *cond)
+{
+	int ret = pthread_cond_signal(&cond->cond);
+	assert(ret == 0);
+	(void)ret;
+}
+
+static inline void
+mythread_cond_wait(mythread_cond *cond, mythread_mutex *mutex)
+{
+	int ret = pthread_cond_wait(&cond->cond, mutex);
+	assert(ret == 0);
+	(void)ret;
+}
+
+// Waits on a condition or until a timeout expires. If the timeout expires,
+// non-zero is returned, otherwise zero is returned.
+static inline int
+mythread_cond_timedwait(mythread_cond *cond, mythread_mutex *mutex,
+		const mythread_condtime *condtime)
+{
+	int ret = pthread_cond_timedwait(&cond->cond, mutex, condtime);
+	assert(ret == 0 || ret == ETIMEDOUT);
+	return ret;
+}
+
+// Sets condtime to the absolute time that is timeout_ms milliseconds
+// in the future. The type of the clock to use is taken from cond.
+static inline void
+mythread_condtime_set(mythread_condtime *condtime, const mythread_cond *cond,
+		uint32_t timeout_ms)
+{
+	condtime->tv_sec = timeout_ms / 1000;
+	condtime->tv_nsec = (timeout_ms % 1000) * 1000000;
+
+#ifdef HAVE_CLOCK_GETTIME
+	struct timespec now;
+	int ret = clock_gettime(cond->clk_id, &now);
+	assert(ret == 0);
+	(void)ret;
+
+	condtime->tv_sec += now.tv_sec;
+	condtime->tv_nsec += now.tv_nsec;
+#else
+	(void)cond;
+
+	struct timeval now;
+	gettimeofday(&now, NULL);
+
+	condtime->tv_sec += now.tv_sec;
+	condtime->tv_nsec += now.tv_usec * 1000L;
+#endif
+
+	// tv_nsec must stay in the range [0, 999_999_999].
+	if (condtime->tv_nsec >= 1000000000L) {
+		condtime->tv_nsec -= 1000000000L;
+		++condtime->tv_sec;
+	}
+}
+
+
+#elif defined(MYTHREAD_WIN95) || defined(MYTHREAD_VISTA)
+
+/////////////////////
+// Windows threads //
+/////////////////////
+
+#define WIN32_LEAN_AND_MEAN
+#ifdef MYTHREAD_VISTA
+#	undef _WIN32_WINNT
+#	define _WIN32_WINNT 0x0600
+#endif
+#include <windows.h>
+#include <process.h>
+
+#define MYTHREAD_RET_TYPE unsigned int __stdcall
+#define MYTHREAD_RET_VALUE 0
+
+typedef HANDLE mythread;
+typedef CRITICAL_SECTION mythread_mutex;
+
+#ifdef MYTHREAD_WIN95
+typedef HANDLE mythread_cond;
+#else
+typedef CONDITION_VARIABLE mythread_cond;
+#endif
+
+typedef struct {
+	// Tick count (milliseconds) in the beginning of the timeout.
+	// NOTE: This is 32 bits so it wraps around after 49.7 days.
+	// Multi-day timeouts may not work as expected.
+	DWORD start;
+
+	// Length of the timeout in milliseconds. The timeout expires
+	// when the current tick count minus "start" is equal or greater
+	// than "timeout".
+	DWORD timeout;
+} mythread_condtime;
+
+
+// mythread_once() is only available with Vista threads.
+#ifdef MYTHREAD_VISTA
+#define mythread_once(func) \
+	do { \
+		static INIT_ONCE once_ = INIT_ONCE_STATIC_INIT; \
+		BOOL pending_; \
+		if (!InitOnceBeginInitialize(&once_, 0, &pending_, NULL)) \
+			abort(); \
+		if (pending_) \
+			func(); \
+		if (!InitOnceComplete(&once, 0, NULL)) \
+			abort(); \
+	} while (0)
+#endif
+
+
+// mythread_sigmask() isn't available on Windows. Even a dummy version would
+// make no sense because the other POSIX signal functions are missing anyway.
+
+
+static inline int
+mythread_create(mythread *thread,
+		unsigned int (__stdcall *func)(void *arg), void *arg)
+{
+	uintptr_t ret = _beginthreadex(NULL, 0, func, arg, 0, NULL);
+	if (ret == 0)
+		return -1;
+
+	*thread = (HANDLE)ret;
+	return 0;
+}
+
+static inline int
+mythread_join(mythread thread)
+{
+	int ret = 0;
+
+	if (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0)
+		ret = -1;
+
+	if (!CloseHandle(thread))
+		ret = -1;
+
+	return ret;
+}
+
+
+static inline int
+mythread_mutex_init(mythread_mutex *mutex)
+{
+	InitializeCriticalSection(mutex);
+	return 0;
+}
+
+static inline void
+mythread_mutex_destroy(mythread_mutex *mutex)
+{
+	DeleteCriticalSection(mutex);
+}
+
+static inline void
+mythread_mutex_lock(mythread_mutex *mutex)
+{
+	EnterCriticalSection(mutex);
+}
+
+static inline void
+mythread_mutex_unlock(mythread_mutex *mutex)
+{
+	LeaveCriticalSection(mutex);
+}
+
+
+static inline int
+mythread_cond_init(mythread_cond *cond)
+{
+#ifdef MYTHREAD_WIN95
+	*cond = CreateEvent(NULL, FALSE, FALSE, NULL);
+	return *cond == NULL ? -1 : 0;
+#else
+	InitializeConditionVariable(cond);
+	return 0;
+#endif
+}
+
+static inline void
+mythread_cond_destroy(mythread_cond *cond)
+{
+#ifdef MYTHREAD_WIN95
+	CloseHandle(*cond);
+#else
+	(void)cond;
+#endif
+}
+
+static inline void
+mythread_cond_signal(mythread_cond *cond)
+{
+#ifdef MYTHREAD_WIN95
+	SetEvent(*cond);
+#else
+	WakeConditionVariable(cond);
+#endif
+}
+
+static inline void
+mythread_cond_wait(mythread_cond *cond, mythread_mutex *mutex)
+{
+#ifdef MYTHREAD_WIN95
+	LeaveCriticalSection(mutex);
+	WaitForSingleObject(*cond, INFINITE);
+	EnterCriticalSection(mutex);
+#else
+	BOOL ret = SleepConditionVariableCS(cond, mutex, INFINITE);
+	assert(ret);
+	(void)ret;
+#endif
+}
+
+static inline int
+mythread_cond_timedwait(mythread_cond *cond, mythread_mutex *mutex,
+		const mythread_condtime *condtime)
+{
+#ifdef MYTHREAD_WIN95
+	LeaveCriticalSection(mutex);
+#endif
+
+	DWORD elapsed = GetTickCount() - condtime->start;
+	DWORD timeout = elapsed >= condtime->timeout
+			? 0 : condtime->timeout - elapsed;
+
+#ifdef MYTHREAD_WIN95
+	DWORD ret = WaitForSingleObject(*cond, timeout);
+	assert(ret == WAIT_OBJECT_0 || ret == WAIT_TIMEOUT);
+
+	EnterCriticalSection(mutex);
+
+	return ret == WAIT_TIMEOUT;
+#else
+	BOOL ret = SleepConditionVariableCS(cond, mutex, timeout);
+	assert(ret || GetLastError() == ERROR_TIMEOUT);
+	return !ret;
+#endif
+}
+
+static inline void
+mythread_condtime_set(mythread_condtime *condtime, const mythread_cond *cond,
+		uint32_t timeout)
+{
+	(void)cond;
+	condtime->start = GetTickCount();
+	condtime->timeout = timeout;
+}
+
+#endif
+
+#endif
diff --git a/Utilities/cmliblzma/common/sysdefs.h b/Utilities/cmliblzma/common/sysdefs.h
index 22f487b..86c5da0 100644
--- a/Utilities/cmliblzma/common/sysdefs.h
+++ b/Utilities/cmliblzma/common/sysdefs.h
@@ -49,9 +49,7 @@
 
 // Some pre-C99 systems have SIZE_MAX in limits.h instead of stdint.h. The
 // limits are also used to figure out some macros missing from pre-C99 systems.
-#ifdef HAVE_LIMITS_H
-#	include <limits.h>
-#endif
+#include <limits.h>
 
 
 #if defined(_MSC_VER) && (_MSC_VER < 1310)
@@ -164,9 +162,7 @@ typedef unsigned char _Bool;
 
 // string.h should be enough but let's include strings.h and memory.h too if
 // they exists, since that shouldn't do any harm, but may improve portability.
-#ifdef HAVE_STRING_H
-#	include <string.h>
-#endif
+#include <string.h>
 
 #ifdef HAVE_STRINGS_H
 #	include <strings.h>
@@ -204,7 +200,8 @@ typedef unsigned char _Bool;
 #	define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
 #endif
 
-#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
+#if defined(__GNUC__) \
+		&& ((__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4)
 #	define lzma_attr_alloc_size(x) __attribute__((__alloc_size__(x)))
 #else
 #	define lzma_attr_alloc_size(x)
diff --git a/Utilities/cmliblzma/common/tuklib_common.h b/Utilities/cmliblzma/common/tuklib_common.h
new file mode 100644
index 0000000..31fbab5
--- /dev/null
+++ b/Utilities/cmliblzma/common/tuklib_common.h
@@ -0,0 +1,71 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_common.h
+/// \brief      Common definitions for tuklib modules
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef TUKLIB_COMMON_H
+#define TUKLIB_COMMON_H
+
+// The config file may be replaced by a package-specific file.
+// It should include at least stddef.h, inttypes.h, and limits.h.
+#include "tuklib_config.h"
+
+// TUKLIB_SYMBOL_PREFIX is prefixed to all symbols exported by
+// the tuklib modules. If you use a tuklib module in a library,
+// you should use TUKLIB_SYMBOL_PREFIX to make sure that there
+// are no symbol conflicts in case someone links your library
+// into application that also uses the same tuklib module.
+#ifndef TUKLIB_SYMBOL_PREFIX
+#	define TUKLIB_SYMBOL_PREFIX
+#endif
+
+#define TUKLIB_CAT_X(a, b) a ## b
+#define TUKLIB_CAT(a, b) TUKLIB_CAT_X(a, b)
+
+#ifndef TUKLIB_SYMBOL
+#	define TUKLIB_SYMBOL(sym) TUKLIB_CAT(TUKLIB_SYMBOL_PREFIX, sym)
+#endif
+
+#ifndef TUKLIB_DECLS_BEGIN
+#	ifdef __cplusplus
+#		define TUKLIB_DECLS_BEGIN extern "C" {
+#	else
+#		define TUKLIB_DECLS_BEGIN
+#	endif
+#endif
+
+#ifndef TUKLIB_DECLS_END
+#	ifdef __cplusplus
+#		define TUKLIB_DECLS_END }
+#	else
+#		define TUKLIB_DECLS_END
+#	endif
+#endif
+
+#if defined(__GNUC__) && defined(__GNUC_MINOR__)
+#	define TUKLIB_GNUC_REQ(major, minor) \
+		((__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)) \
+			|| __GNUC__ > (major))
+#else
+#	define TUKLIB_GNUC_REQ(major, minor) 0
+#endif
+
+#if TUKLIB_GNUC_REQ(2, 5)
+#	define tuklib_attr_noreturn __attribute__((__noreturn__))
+#else
+#	define tuklib_attr_noreturn
+#endif
+
+#if (defined(_WIN32) && !defined(__CYGWIN__)) \
+		|| defined(__OS2__) || defined(__MSDOS__)
+#	define TUKLIB_DOSLIKE 1
+#endif
+
+#endif
diff --git a/Utilities/cmliblzma/common/tuklib_config.h b/Utilities/cmliblzma/common/tuklib_config.h
new file mode 100644
index 0000000..549cb24
--- /dev/null
+++ b/Utilities/cmliblzma/common/tuklib_config.h
@@ -0,0 +1,7 @@
+#ifdef HAVE_CONFIG_H
+#	include "sysdefs.h"
+#else
+#	include <stddef.h>
+#	include <inttypes.h>
+#	include <limits.h>
+#endif
diff --git a/Utilities/cmliblzma/common/tuklib_cpucores.c b/Utilities/cmliblzma/common/tuklib_cpucores.c
new file mode 100644
index 0000000..cc968dd
--- /dev/null
+++ b/Utilities/cmliblzma/common/tuklib_cpucores.c
@@ -0,0 +1,100 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_cpucores.c
+/// \brief      Get the number of CPU cores online
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "tuklib_cpucores.h"
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#	ifndef _WIN32_WINNT
+#		define _WIN32_WINNT 0x0500
+#	endif
+#	include <windows.h>
+
+// glibc >= 2.9
+#elif defined(TUKLIB_CPUCORES_SCHED_GETAFFINITY)
+#	include <sched.h>
+
+// FreeBSD
+#elif defined(TUKLIB_CPUCORES_CPUSET)
+#	include <sys/param.h>
+#	include <sys/cpuset.h>
+
+#elif defined(TUKLIB_CPUCORES_SYSCTL)
+#	ifdef HAVE_SYS_PARAM_H
+#		include <sys/param.h>
+#	endif
+#	include <sys/sysctl.h>
+
+#elif defined(TUKLIB_CPUCORES_SYSCONF)
+#	include <unistd.h>
+
+// HP-UX
+#elif defined(TUKLIB_CPUCORES_PSTAT_GETDYNAMIC)
+#	include <sys/param.h>
+#	include <sys/pstat.h>
+#endif
+
+
+extern uint32_t
+tuklib_cpucores(void)
+{
+	uint32_t ret = 0;
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+	SYSTEM_INFO sysinfo;
+	GetSystemInfo(&sysinfo);
+	ret = sysinfo.dwNumberOfProcessors;
+
+#elif defined(TUKLIB_CPUCORES_SCHED_GETAFFINITY)
+	cpu_set_t cpu_mask;
+	if (sched_getaffinity(0, sizeof(cpu_mask), &cpu_mask) == 0)
+		ret = (uint32_t)CPU_COUNT(&cpu_mask);
+
+#elif defined(TUKLIB_CPUCORES_CPUSET)
+	cpuset_t set;
+	if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
+			sizeof(set), &set) == 0) {
+#	ifdef CPU_COUNT
+		ret = (uint32_t)CPU_COUNT(&set);
+#	else
+		for (unsigned i = 0; i < CPU_SETSIZE; ++i)
+			if (CPU_ISSET(i, &set))
+				++ret;
+#	endif
+	}
+
+#elif defined(TUKLIB_CPUCORES_SYSCTL)
+	int name[2] = { CTL_HW, HW_NCPU };
+	int cpus;
+	size_t cpus_size = sizeof(cpus);
+	if (sysctl(name, 2, &cpus, &cpus_size, NULL, 0) != -1
+			&& cpus_size == sizeof(cpus) && cpus > 0)
+		ret = (uint32_t)cpus;
+
+#elif defined(TUKLIB_CPUCORES_SYSCONF)
+#	ifdef _SC_NPROCESSORS_ONLN
+	// Most systems
+	const long cpus = sysconf(_SC_NPROCESSORS_ONLN);
+#	else
+	// IRIX
+	const long cpus = sysconf(_SC_NPROC_ONLN);
+#	endif
+	if (cpus > 0)
+		ret = (uint32_t)cpus;
+
+#elif defined(TUKLIB_CPUCORES_PSTAT_GETDYNAMIC)
+	struct pst_dynamic pst;
+	if (pstat_getdynamic(&pst, sizeof(pst), 1, 0) != -1)
+		ret = (uint32_t)pst.psd_proc_cnt;
+#endif
+
+	return ret;
+}
diff --git a/Utilities/cmliblzma/common/tuklib_cpucores.h b/Utilities/cmliblzma/common/tuklib_cpucores.h
new file mode 100644
index 0000000..be1ce1c
--- /dev/null
+++ b/Utilities/cmliblzma/common/tuklib_cpucores.h
@@ -0,0 +1,23 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_cpucores.h
+/// \brief      Get the number of CPU cores online
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef TUKLIB_CPUCORES_H
+#define TUKLIB_CPUCORES_H
+
+#include "tuklib_common.h"
+TUKLIB_DECLS_BEGIN
+
+#define tuklib_cpucores TUKLIB_SYMBOL(tuklib_cpucores)
+extern uint32_t tuklib_cpucores(void);
+
+TUKLIB_DECLS_END
+#endif
diff --git a/Utilities/cmliblzma/common/tuklib_integer.h b/Utilities/cmliblzma/common/tuklib_integer.h
index 5d82685..4e61357 100644
--- a/Utilities/cmliblzma/common/tuklib_integer.h
+++ b/Utilities/cmliblzma/common/tuklib_integer.h
@@ -6,22 +6,26 @@
 /// This file provides macros or functions to do some basic integer and bit
 /// operations.
 ///
-/// Endianness related integer operations (XX = 16, 32, or 64; Y = b or l):
-///   - Byte swapping: bswapXX(num)
-///   - Byte order conversions to/from native: convXXYe(num)
-///   - Aligned reads: readXXYe(ptr)
-///   - Aligned writes: writeXXYe(ptr, num)
-///   - Unaligned reads (16/32-bit only): unaligned_readXXYe(ptr)
-///   - Unaligned writes (16/32-bit only): unaligned_writeXXYe(ptr, num)
+/// Native endian inline functions (XX = 16, 32, or 64):
+///   - Unaligned native endian reads: readXXne(ptr)
+///   - Unaligned native endian writes: writeXXne(ptr, num)
+///   - Aligned native endian reads: aligned_readXXne(ptr)
+///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
 ///
-/// Since they can macros, the arguments should have no side effects since
-/// they may be evaluated more than once.
+/// Endianness-converting integer operations (these can be macros!)
+/// (XX = 16, 32, or 64; Y = b or l):
+///   - Byte swapping: bswapXX(num)
+///   - Byte order conversions to/from native (byteswaps if Y isn't
+///     the native endianness): convXXYe(num)
+///   - Unaligned reads (16/32-bit only): readXXYe(ptr)
+///   - Unaligned writes (16/32-bit only): writeXXYe(ptr, num)
+///   - Aligned reads: aligned_readXXYe(ptr)
+///   - Aligned writes: aligned_writeXXYe(ptr, num)
 ///
-/// \todo       PowerPC and possibly some other architectures support
-///             byte swapping load and store instructions. This file
-///             doesn't take advantage of those instructions.
+/// Since the above can macros, the arguments should have no side effects
+/// because they may be evaluated more than once.
 ///
-/// Bit scan operations for non-zero 32-bit integers:
+/// Bit scan operations for non-zero 32-bit integers (inline functions):
 ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
 ///   - Count leading zeros: clz32(num)
 ///   - Count trailing zeros: ctz32(num)
@@ -41,22 +45,27 @@
 #ifndef TUKLIB_INTEGER_H
 #define TUKLIB_INTEGER_H
 
-#include "sysdefs.h"
+#include "tuklib_common.h"
+#include <string.h>
 
-#if defined(__GNUC__) && defined(__GNUC_MINOR__)
-#   define TUKLIB_GNUC_REQ(major, minor) \
-        ((__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)) \
-            || __GNUC__ > (major))
-#else
-#   define TUKLIB_GNUC_REQ(major, minor) 0
+// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
+// and such functions.
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
+#	include <immintrin.h>
 #endif
 
 
-////////////////////////////////////////
-// Operating system specific features //
-////////////////////////////////////////
+///////////////////
+// Byte swapping //
+///////////////////
+
+#if defined(HAVE___BUILTIN_BSWAPXX)
+	// GCC >= 4.8 and Clang
+#	define bswap16(n) __builtin_bswap16(n)
+#	define bswap32(n) __builtin_bswap32(n)
+#	define bswap64(n) __builtin_bswap64(n)
 
-#if defined(HAVE_BYTESWAP_H)
+#elif defined(HAVE_BYTESWAP_H)
 	// glibc, uClibc, dietlibc
 #	include <byteswap.h>
 #	ifdef HAVE_BSWAP_16
@@ -105,45 +114,33 @@
 #	endif
 #endif
 
-
-////////////////////////////////
-// Compiler-specific features //
-////////////////////////////////
-
-// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
-// and such functions.
-#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
-#	include <immintrin.h>
-#endif
-
-
-///////////////////
-// Byte swapping //
-///////////////////
-
 #ifndef bswap16
-#	define bswap16(num) \
-		(((uint16_t)(num) << 8) | ((uint16_t)(num) >> 8))
+#	define bswap16(n) (uint16_t)( \
+		  (((n) & 0x00FFU) << 8) \
+		| (((n) & 0xFF00U) >> 8) \
+	)
 #endif
 
 #ifndef bswap32
-#	define bswap32(num) \
-		( (((uint32_t)(num) << 24)                       ) \
-		| (((uint32_t)(num) <<  8) & UINT32_C(0x00FF0000)) \
-		| (((uint32_t)(num) >>  8) & UINT32_C(0x0000FF00)) \
-		| (((uint32_t)(num) >> 24)                       ) )
+#	define bswap32(n) (uint32_t)( \
+		  (((n) & UINT32_C(0x000000FF)) << 24) \
+		| (((n) & UINT32_C(0x0000FF00)) << 8) \
+		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
+		| (((n) & UINT32_C(0xFF000000)) >> 24) \
+	)
 #endif
 
 #ifndef bswap64
-#	define bswap64(num) \
-		( (((uint64_t)(num) << 56)                               ) \
-		| (((uint64_t)(num) << 40) & UINT64_C(0x00FF000000000000)) \
-		| (((uint64_t)(num) << 24) & UINT64_C(0x0000FF0000000000)) \
-		| (((uint64_t)(num) <<  8) & UINT64_C(0x000000FF00000000)) \
-		| (((uint64_t)(num) >>  8) & UINT64_C(0x00000000FF000000)) \
-		| (((uint64_t)(num) >> 24) & UINT64_C(0x0000000000FF0000)) \
-		| (((uint64_t)(num) >> 40) & UINT64_C(0x000000000000FF00)) \
-		| (((uint64_t)(num) >> 56)                               ) )
+#	define bswap64(n) (uint64_t)( \
+		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
+		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
+		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
+		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
+		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
+		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
+		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
+		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
+	)
 #endif
 
 // Define conversion macros using the basic byte swapping macros.
@@ -188,76 +185,76 @@
 #endif
 
 
-//////////////////////////////
-// Aligned reads and writes //
-//////////////////////////////
-
-static inline uint16_t
-read16be(const uint8_t *buf)
-{
-	uint16_t num = *(const uint16_t *)buf;
-	return conv16be(num);
-}
+////////////////////////////////
+// Unaligned reads and writes //
+////////////////////////////////
 
+// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
+// is bad even if the uint8_pointer is properly aligned because this kind
+// of casts break strict aliasing rules and result in undefined behavior.
+// With unaligned pointers it's even worse: compilers may emit vector
+// instructions that require aligned pointers even if non-vector
+// instructions work with unaligned pointers.
+//
+// Using memcpy() is the standard compliant way to do unaligned access.
+// Many modern compilers inline it so there is no function call overhead.
+// For those compilers that don't handle the memcpy() method well, the
+// old casting method (that violates strict aliasing) can be requested at
+// build time. A third method, casting to a packed struct, would also be
+// an option but isn't provided to keep things simpler (it's already a mess).
+// Hopefully this is flexible enough in practice.
 
 static inline uint16_t
-read16le(const uint8_t *buf)
+read16ne(const uint8_t *buf)
 {
-	uint16_t num = *(const uint16_t *)buf;
-	return conv16le(num);
-}
-
-
-static inline uint32_t
-read32be(const uint8_t *buf)
-{
-	uint32_t num = *(const uint32_t *)buf;
-	return conv32be(num);
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	return *(const uint16_t *)buf;
+#else
+	uint16_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
 }
 
 
 static inline uint32_t
-read32le(const uint8_t *buf)
-{
-	uint32_t num = *(const uint32_t *)buf;
-	return conv32le(num);
-}
-
-
-static inline uint64_t
-read64be(const uint8_t *buf)
+read32ne(const uint8_t *buf)
 {
-	uint64_t num = *(const uint64_t *)buf;
-	return conv64be(num);
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	return *(const uint32_t *)buf;
+#else
+	uint32_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
 }
 
 
 static inline uint64_t
-read64le(const uint8_t *buf)
+read64ne(const uint8_t *buf)
 {
-	uint64_t num = *(const uint64_t *)buf;
-	return conv64le(num);
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	return *(const uint64_t *)buf;
+#else
+	uint64_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
 }
 
 
-// NOTE: Possible byte swapping must be done in a macro to allow GCC
-// to optimize byte swapping of constants when using glibc's or *BSD's
-// byte swapping macros. The actual write is done in an inline function
-// to make type checking of the buf pointer possible similarly to readXXYe()
-// functions.
-
-#define write16be(buf, num) write16ne((buf), conv16be(num))
-#define write16le(buf, num) write16ne((buf), conv16le(num))
-#define write32be(buf, num) write32ne((buf), conv32be(num))
-#define write32le(buf, num) write32ne((buf), conv32le(num))
-#define write64be(buf, num) write64ne((buf), conv64be(num))
-#define write64le(buf, num) write64ne((buf), conv64le(num))
-
-
 static inline void
 write16ne(uint8_t *buf, uint16_t num)
 {
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 	*(uint16_t *)buf = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
 	return;
 }
 
@@ -265,7 +262,12 @@ write16ne(uint8_t *buf, uint16_t num)
 static inline void
 write32ne(uint8_t *buf, uint32_t num)
 {
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 	*(uint32_t *)buf = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
 	return;
 }
 
@@ -273,90 +275,114 @@ write32ne(uint8_t *buf, uint32_t num)
 static inline void
 write64ne(uint8_t *buf, uint64_t num)
 {
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
 	*(uint64_t *)buf = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
 	return;
 }
 
 
-////////////////////////////////
-// Unaligned reads and writes //
-////////////////////////////////
-
-// NOTE: TUKLIB_FAST_UNALIGNED_ACCESS indicates only support for 16-bit and
-// 32-bit unaligned integer loads and stores. It's possible that 64-bit
-// unaligned access doesn't work or is slower than byte-by-byte access.
-// Since unaligned 64-bit is probably not needed as often as 16-bit or
-// 32-bit, we simply don't support 64-bit unaligned access for now.
-#ifdef TUKLIB_FAST_UNALIGNED_ACCESS
-#	define unaligned_read16be read16be
-#	define unaligned_read16le read16le
-#	define unaligned_read32be read32be
-#	define unaligned_read32le read32le
-#	define unaligned_write16be write16be
-#	define unaligned_write16le write16le
-#	define unaligned_write32be write32be
-#	define unaligned_write32le write32le
-
-#else
-
 static inline uint16_t
-unaligned_read16be(const uint8_t *buf)
+read16be(const uint8_t *buf)
 {
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	uint16_t num = read16ne(buf);
+	return conv16be(num);
+#else
 	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
 	return num;
+#endif
 }
 
 
 static inline uint16_t
-unaligned_read16le(const uint8_t *buf)
+read16le(const uint8_t *buf)
 {
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	uint16_t num = read16ne(buf);
+	return conv16le(num);
+#else
 	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
 	return num;
+#endif
 }
 
 
 static inline uint32_t
-unaligned_read32be(const uint8_t *buf)
+read32be(const uint8_t *buf)
 {
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	uint32_t num = read32ne(buf);
+	return conv32be(num);
+#else
 	uint32_t num = (uint32_t)buf[0] << 24;
 	num |= (uint32_t)buf[1] << 16;
 	num |= (uint32_t)buf[2] << 8;
 	num |= (uint32_t)buf[3];
 	return num;
+#endif
 }
 
 
 static inline uint32_t
-unaligned_read32le(const uint8_t *buf)
+read32le(const uint8_t *buf)
 {
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	uint32_t num = read32ne(buf);
+	return conv32le(num);
+#else
 	uint32_t num = (uint32_t)buf[0];
 	num |= (uint32_t)buf[1] << 8;
 	num |= (uint32_t)buf[2] << 16;
 	num |= (uint32_t)buf[3] << 24;
 	return num;
+#endif
 }
 
 
+// NOTE: Possible byte swapping must be done in a macro to allow the compiler
+// to optimize byte swapping of constants when using glibc's or *BSD's
+// byte swapping macros. The actual write is done in an inline function
+// to make type checking of the buf pointer possible.
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+#	define write16be(buf, num) write16ne(buf, conv16be(num))
+#	define write32be(buf, num) write32ne(buf, conv32be(num))
+#endif
+
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+#	define write16le(buf, num) write16ne(buf, conv16le(num))
+#	define write32le(buf, num) write32ne(buf, conv32le(num))
+#endif
+
+
+#ifndef write16be
 static inline void
-unaligned_write16be(uint8_t *buf, uint16_t num)
+write16be(uint8_t *buf, uint16_t num)
 {
 	buf[0] = (uint8_t)(num >> 8);
 	buf[1] = (uint8_t)num;
 	return;
 }
+#endif
 
 
+#ifndef write16le
 static inline void
-unaligned_write16le(uint8_t *buf, uint16_t num)
+write16le(uint8_t *buf, uint16_t num)
 {
 	buf[0] = (uint8_t)num;
 	buf[1] = (uint8_t)(num >> 8);
 	return;
 }
+#endif
 
 
+#ifndef write32be
 static inline void
-unaligned_write32be(uint8_t *buf, uint32_t num)
+write32be(uint8_t *buf, uint32_t num)
 {
 	buf[0] = (uint8_t)(num >> 24);
 	buf[1] = (uint8_t)(num >> 16);
@@ -364,10 +390,12 @@ unaligned_write32be(uint8_t *buf, uint32_t num)
 	buf[3] = (uint8_t)num;
 	return;
 }
+#endif
 
 
+#ifndef write32le
 static inline void
-unaligned_write32le(uint8_t *buf, uint32_t num)
+write32le(uint8_t *buf, uint32_t num)
 {
 	buf[0] = (uint8_t)num;
 	buf[1] = (uint8_t)(num >> 8);
@@ -375,10 +403,184 @@ unaligned_write32le(uint8_t *buf, uint32_t num)
 	buf[3] = (uint8_t)(num >> 24);
 	return;
 }
+#endif
+
+
+//////////////////////////////
+// Aligned reads and writes //
+//////////////////////////////
 
+// Separate functions for aligned reads and writes are provided since on
+// strict-align archs aligned access is much faster than unaligned access.
+//
+// Just like in the unaligned case, memcpy() is needed to avoid
+// strict aliasing violations. However, on archs that don't support
+// unaligned access the compiler cannot know that the pointers given
+// to memcpy() are aligned which results in slow code. As of C11 there is
+// no standard way to tell the compiler that we know that the address is
+// aligned but some compilers have language extensions to do that. With
+// such language extensions the memcpy() method gives excellent results.
+//
+// What to do on a strict-align system when no known language extentensions
+// are available? Falling back to byte-by-byte access would be safe but ruin
+// optimizations that have been made specifically with aligned access in mind.
+// As a compromise, aligned reads will fall back to non-compliant type punning
+// but aligned writes will be byte-by-byte, that is, fast reads are preferred
+// over fast writes. This obviously isn't great but hopefully it's a working
+// compromise for now.
+//
+// __builtin_assume_aligned is support by GCC >= 4.7 and clang >= 3.6.
+#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
+#	define tuklib_memcpy_aligned(dest, src, size) \
+		memcpy(dest, __builtin_assume_aligned(src, size), size)
+#else
+#	define tuklib_memcpy_aligned(dest, src, size) \
+		memcpy(dest, src, size)
+#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
+#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
+#	endif
 #endif
 
 
+static inline uint16_t
+aligned_read16ne(const uint8_t *buf)
+{
+#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
+		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
+	return *(const uint16_t *)buf;
+#else
+	uint16_t num;
+	tuklib_memcpy_aligned(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint32_t
+aligned_read32ne(const uint8_t *buf)
+{
+#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
+		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
+	return *(const uint32_t *)buf;
+#else
+	uint32_t num;
+	tuklib_memcpy_aligned(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint64_t
+aligned_read64ne(const uint8_t *buf)
+{
+#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
+		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
+	return *(const uint64_t *)buf;
+#else
+	uint64_t num;
+	tuklib_memcpy_aligned(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline void
+aligned_write16ne(uint8_t *buf, uint16_t num)
+{
+#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
+	*(uint16_t *)buf = num;
+#else
+	tuklib_memcpy_aligned(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+aligned_write32ne(uint8_t *buf, uint32_t num)
+{
+#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
+	*(uint32_t *)buf = num;
+#else
+	tuklib_memcpy_aligned(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+aligned_write64ne(uint8_t *buf, uint64_t num)
+{
+#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
+	*(uint64_t *)buf = num;
+#else
+	tuklib_memcpy_aligned(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline uint16_t
+aligned_read16be(const uint8_t *buf)
+{
+	uint16_t num = aligned_read16ne(buf);
+	return conv16be(num);
+}
+
+
+static inline uint16_t
+aligned_read16le(const uint8_t *buf)
+{
+	uint16_t num = aligned_read16ne(buf);
+	return conv16le(num);
+}
+
+
+static inline uint32_t
+aligned_read32be(const uint8_t *buf)
+{
+	uint32_t num = aligned_read32ne(buf);
+	return conv32be(num);
+}
+
+
+static inline uint32_t
+aligned_read32le(const uint8_t *buf)
+{
+	uint32_t num = aligned_read32ne(buf);
+	return conv32le(num);
+}
+
+
+static inline uint64_t
+aligned_read64be(const uint8_t *buf)
+{
+	uint64_t num = aligned_read64ne(buf);
+	return conv64be(num);
+}
+
+
+static inline uint64_t
+aligned_read64le(const uint8_t *buf)
+{
+	uint64_t num = aligned_read64ne(buf);
+	return conv64le(num);
+}
+
+
+// These need to be macros like in the unaligned case.
+#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
+#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
+#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
+#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
+#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
+#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
+
+
+////////////////////
+// Bit operations //
+////////////////////
+
 static inline uint32_t
 bsr32(uint32_t n)
 {
@@ -391,7 +593,7 @@ bsr32(uint32_t n)
 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
 	// XOR (if -march indicates that SSE4a instructions are supported).
-	return __builtin_clz(n) ^ 31U;
+	return (uint32_t)__builtin_clz(n) ^ 31U;
 
 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 	uint32_t i;
@@ -401,27 +603,27 @@ bsr32(uint32_t n)
 #else
 	uint32_t i = 31;
 
-	if ((n & UINT32_C(0xFFFF0000)) == 0) {
+	if ((n & 0xFFFF0000) == 0) {
 		n <<= 16;
 		i = 15;
 	}
 
-	if ((n & UINT32_C(0xFF000000)) == 0) {
+	if ((n & 0xFF000000) == 0) {
 		n <<= 8;
 		i -= 8;
 	}
 
-	if ((n & UINT32_C(0xF0000000)) == 0) {
+	if ((n & 0xF0000000) == 0) {
 		n <<= 4;
 		i -= 4;
 	}
 
-	if ((n & UINT32_C(0xC0000000)) == 0) {
+	if ((n & 0xC0000000) == 0) {
 		n <<= 2;
 		i -= 2;
 	}
 
-	if ((n & UINT32_C(0x80000000)) == 0)
+	if ((n & 0x80000000) == 0)
 		--i;
 
 	return i;
@@ -436,7 +638,7 @@ clz32(uint32_t n)
 	return _bit_scan_reverse(n) ^ 31U;
 
 #elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX == UINT32_MAX
-	return __builtin_clz(n);
+	return (uint32_t)__builtin_clz(n);
 
 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 	uint32_t i;
@@ -448,27 +650,27 @@ clz32(uint32_t n)
 #else
 	uint32_t i = 0;
 
-	if ((n & UINT32_C(0xFFFF0000)) == 0) {
+	if ((n & 0xFFFF0000) == 0) {
 		n <<= 16;
 		i = 16;
 	}
 
-	if ((n & UINT32_C(0xFF000000)) == 0) {
+	if ((n & 0xFF000000) == 0) {
 		n <<= 8;
 		i += 8;
 	}
 
-	if ((n & UINT32_C(0xF0000000)) == 0) {
+	if ((n & 0xF0000000) == 0) {
 		n <<= 4;
 		i += 4;
 	}
 
-	if ((n & UINT32_C(0xC0000000)) == 0) {
+	if ((n & 0xC0000000) == 0) {
 		n <<= 2;
 		i += 2;
 	}
 
-	if ((n & UINT32_C(0x80000000)) == 0)
+	if ((n & 0x80000000) == 0)
 		++i;
 
 	return i;
@@ -483,7 +685,7 @@ ctz32(uint32_t n)
 	return _bit_scan_forward(n);
 
 #elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX >= UINT32_MAX
-	return __builtin_ctz(n);
+	return (uint32_t)__builtin_ctz(n);
 
 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 	uint32_t i;
@@ -493,27 +695,27 @@ ctz32(uint32_t n)
 #else
 	uint32_t i = 0;
 
-	if ((n & UINT32_C(0x0000FFFF)) == 0) {
+	if ((n & 0x0000FFFF) == 0) {
 		n >>= 16;
 		i = 16;
 	}
 
-	if ((n & UINT32_C(0x000000FF)) == 0) {
+	if ((n & 0x000000FF) == 0) {
 		n >>= 8;
 		i += 8;
 	}
 
-	if ((n & UINT32_C(0x0000000F)) == 0) {
+	if ((n & 0x0000000F) == 0) {
 		n >>= 4;
 		i += 4;
 	}
 
-	if ((n & UINT32_C(0x00000003)) == 0) {
+	if ((n & 0x00000003) == 0) {
 		n >>= 2;
 		i += 2;
 	}
 
-	if ((n & UINT32_C(0x00000001)) == 0)
+	if ((n & 0x00000001) == 0)
 		++i;
 
 	return i;