diff --git a/CMakeLists.txt b/CMakeLists.txt
index e108c16f8..819847356 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.4.4)
+cmake_minimum_required(VERSION 2.6)
 set(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS ON)
 
 project(zlib C)
@@ -96,6 +96,7 @@ set(ZLIB_PUBLIC_HDRS
 )
 set(ZLIB_PRIVATE_HDRS
     crc32.h
+    cpu_features.h
     deflate.h
     gzguts.h
     inffast.h
@@ -108,6 +109,7 @@ set(ZLIB_PRIVATE_HDRS
 set(ZLIB_SRCS
     adler32.c
     compress.c
+    cpu_features.c
     crc32.c
     deflate.c
     gzclose.c
@@ -136,6 +138,12 @@ if(CMAKE_COMPILER_IS_GNUCC)
         set(ZLIB_ASMS contrib/amd64/amd64-match.S)
     endif ()
 
+    # NEON is mandatory in ARMv8, but 'crypto extensions' are optional.
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+        set_source_files_properties(${ZLIB_SRCS} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a+crc)
+        add_definitions(-DCRC32_ARMV8_CRC32)
+    endif()
+
 	if(ZLIB_ASMS)
 		add_definitions(-DASMV)
 		set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
@@ -200,6 +208,19 @@ if(NOT CYGWIN)
 endif()
 
 if(UNIX)
+    # CPU features detection in a multithreaded context (e.g. Chromium) can
+    # be racy and requires proper handling.
+    find_package (Threads REQUIRED)
+    target_link_libraries(zlib ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(zlibstatic ${CMAKE_THREAD_LIBS_INIT})
+
+    # This is limited, basically we assume either Arm or Intel.
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+        add_definitions(-DARMV8_OS_LINUX)
+    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+        add_definitions(-DX86_NOT_WINDOWS)
+    endif()
+
     # On unix-like platforms the library is almost always called libz
    set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z)
    if(NOT APPLE)
diff --git a/cpu_features.c b/cpu_features.c
new file mode 100644
index 000000000..2c5b39e1d
--- /dev/null
+++ b/cpu_features.c
@@ -0,0 +1,170 @@
+/* cpu_features.c -- Processor features detection.
+ *
+ * Copyright 2018 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#include "cpu_features.h"
+#include "zutil.h"
+
+#include <stdint.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(X86_NOT_WINDOWS) || defined(X86_WINDOWS)
+#include <cpuid.h>
+#endif
+
+/* TODO(cavalcantii): remove checks for x86_flags on deflate.
+ */
+/* Feature flags. They stay 0 until cpu_check_features() has run. */
+int ZLIB_INTERNAL arm_cpu_enable_crc32 = 0;
+int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
+int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
+int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
+
+#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA)
+#include <pthread.h>
+#endif
+
+#if defined(ARMV8_OS_ANDROID)
+#include <cpu-features.h>
+#elif defined(ARMV8_OS_LINUX)
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#elif defined(ARMV8_OS_FUCHSIA)
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+#include <zircon/types.h>
+#elif defined(ARMV8_OS_WINDOWS) || defined(X86_WINDOWS)
+#include <windows.h>
+#elif !defined(_MSC_VER)
+#include <pthread.h>
+#else
+#error cpu_features.c CPU feature detection is not defined for your platform
+#endif
+
+#if !defined(CPU_NO_SIMD) && !defined(ARM_OS_IOS)
+static void _cpu_check_features(void);
+#endif
+
+/*
+ * cpu_check_features() fills in the feature flags defined above. Where a
+ * detection routine exists it is run through pthread_once() or
+ * InitOnceExecuteOnce(), so the probe itself is executed a single time.
+ */
+#if !defined(CPU_NO_SIMD) && !defined(ARM_OS_IOS) && \
+    (defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
+     defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS))
+static pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
+void ZLIB_INTERNAL cpu_check_features(void)
+{
+    pthread_once(&cpu_check_inited_once, _cpu_check_features);
+}
+#elif !defined(CPU_NO_SIMD) && !defined(ARM_OS_IOS) && \
+    (defined(ARMV8_OS_WINDOWS) || defined(X86_WINDOWS))
+static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT;
+/* Adapter with the callback signature InitOnceExecuteOnce() requires. */
+static BOOL CALLBACK _cpu_check_features_forwarder(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+    _cpu_check_features();
+    return TRUE;
+}
+void ZLIB_INTERNAL cpu_check_features(void)
+{
+    InitOnceExecuteOnce(&cpu_check_inited_once, _cpu_check_features_forwarder,
+                        NULL, NULL);
+}
+#else
+/*
+ * No run-time detection in this configuration. The symbol is still provided
+ * because crc32_z() calls it unconditionally; every flag keeps its initial 0
+ * so callers take the portable code paths.
+ */
+void ZLIB_INTERNAL cpu_check_features(void)
+{
+}
+#endif
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+/*
+ * iOS@ARM is a special case where we always have NEON but don't check
+ * for crypto extensions.
+ */
+#ifndef ARM_OS_IOS
+/*
+ * See http://bit.ly/2CcoEsr for run-time detection of ARM features and also
+ * crbug.com/931275 for android_getCpuFeatures() use in the Android sandbox.
+ *
+ * Sets arm_cpu_enable_crc32 and arm_cpu_enable_pmull from the capability
+ * bits the OS reports.
+ */
+static void _cpu_check_features(void)
+{
+#if defined(ARMV8_OS_ANDROID) && defined(__aarch64__)
+    uint64_t features = android_getCpuFeatures();
+    arm_cpu_enable_crc32 = !!(features & ANDROID_CPU_ARM64_FEATURE_CRC32);
+    arm_cpu_enable_pmull = !!(features & ANDROID_CPU_ARM64_FEATURE_PMULL);
+#elif defined(ARMV8_OS_ANDROID) /* aarch32 */
+    uint64_t features = android_getCpuFeatures();
+    arm_cpu_enable_crc32 = !!(features & ANDROID_CPU_ARM_FEATURE_CRC32);
+    arm_cpu_enable_pmull = !!(features & ANDROID_CPU_ARM_FEATURE_PMULL);
+#elif defined(ARMV8_OS_LINUX) && defined(__aarch64__)
+    unsigned long features = getauxval(AT_HWCAP);
+    arm_cpu_enable_crc32 = !!(features & HWCAP_CRC32);
+    arm_cpu_enable_pmull = !!(features & HWCAP_PMULL);
+#elif defined(ARMV8_OS_LINUX) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
+    /* Query HWCAP2 for ARMV8-A SoCs running in aarch32 mode */
+    unsigned long features = getauxval(AT_HWCAP2);
+    arm_cpu_enable_crc32 = !!(features & HWCAP2_CRC32);
+    arm_cpu_enable_pmull = !!(features & HWCAP2_PMULL);
+#elif defined(ARMV8_OS_FUCHSIA)
+    uint32_t features;
+    zx_status_t rc = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+    if (rc != ZX_OK || (features & ZX_ARM64_FEATURE_ISA_ASIMD) == 0)
+        return;  /* Report nothing if ASIMD(NEON) is missing */
+    arm_cpu_enable_crc32 = !!(features & ZX_ARM64_FEATURE_ISA_CRC32);
+    arm_cpu_enable_pmull = !!(features & ZX_ARM64_FEATURE_ISA_PMULL);
+#elif defined(ARMV8_OS_WINDOWS)
+    arm_cpu_enable_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+    arm_cpu_enable_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+#endif
+}
+#endif
+#elif defined(X86_NOT_WINDOWS) || defined(X86_WINDOWS)
+/*
+ * iOS@x86 (i.e. emulator) is another special case where we disable
+ * SIMD optimizations.
+ */
+#ifndef CPU_NO_SIMD
+/* On x86 we simply use an instruction to check the CPU features.
+ * (i.e. CPUID).
+ *
+ * x86_cpu_enable_ssse3 is nonzero when SSSE3 is reported; x86_cpu_enable_simd
+ * is 1 only when SSE2, SSE4.2 and PCLMULQDQ are all reported.
+ */
+static void _cpu_check_features(void)
+{
+    int x86_cpu_has_sse2;
+    int x86_cpu_has_ssse3;
+    int x86_cpu_has_sse42;
+    int x86_cpu_has_pclmulqdq;
+    int abcd[4];
+#ifdef _MSC_VER
+    __cpuid(abcd, 1);
+#else
+    __cpuid(1, abcd[0], abcd[1], abcd[2], abcd[3]);
+#endif
+    x86_cpu_has_sse2 = abcd[3] & 0x4000000;      /* CPUID.1:EDX bit 26 */
+    x86_cpu_has_ssse3 = abcd[2] & 0x000200;      /* CPUID.1:ECX bit 9 */
+    x86_cpu_has_sse42 = abcd[2] & 0x100000;      /* CPUID.1:ECX bit 20 */
+    x86_cpu_has_pclmulqdq = abcd[2] & 0x2;       /* CPUID.1:ECX bit 1 */
+
+    x86_cpu_enable_ssse3 = x86_cpu_has_ssse3;
+
+    x86_cpu_enable_simd = x86_cpu_has_sse2 &&
+                          x86_cpu_has_sse42 &&
+                          x86_cpu_has_pclmulqdq;
+}
+#endif
+#endif
diff --git a/cpu_features.h b/cpu_features.h
new file mode 100644
index 000000000..2a4a79734
--- /dev/null
+++ b/cpu_features.h
@@ -0,0 +1,26 @@
+/* cpu_features.h -- Processor features detection.
+ *
+ * Copyright 2018 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ */
+
+#ifndef ZLIB_CPU_FEATURES_H
+#define ZLIB_CPU_FEATURES_H
+
+#include "zlib.h"
+
+/* TODO(cavalcantii): remove checks for x86_flags on deflate.
+ */
+/* Feature flags, defined in cpu_features.c. They stay 0 until
+ * cpu_check_features() has run.
+ */
+extern int arm_cpu_enable_crc32;
+extern int arm_cpu_enable_pmull;
+extern int x86_cpu_enable_ssse3;
+extern int x86_cpu_enable_simd;
+
+/* Runs CPU feature detection (where available) and updates the flags above. */
+void cpu_check_features(void);
+
+#endif /* ZLIB_CPU_FEATURES_H */
diff --git a/crc32.c b/crc32.c
index 62cb9fe34..a801a39c0 100644
--- a/crc32.c
+++ b/crc32.c
@@ -27,6 +27,7 @@
 #  endif /* !DYNAMIC_CRC_TABLE */
 #endif /* MAKECRCH */
 
+#include "cpu_features.h"
 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
 
  /*
@@ -620,13 +621,13 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
 /* =========================================================================
  * Use ARM machine instructions if available. This will compute the CRC about
  * ten times faster than the braided calculation. This code does not check for
- * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
+ * the presence of the CRC instruction at run time. CRC32_ARMV8_CRC32 will
  * only be defined if the compilation specifies an ARM processor architecture
  * that has the instructions. For example, compiling with -march=armv8.1-a or
  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
  * instructions.
  */
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
+#if defined(__aarch64__) && defined(CRC32_ARMV8_CRC32) && W == 8
 
 /*
    Constants empirically determined to maximize speed. These values are from
@@ -636,7 +637,7 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
 
-unsigned long ZEXPORT crc32_z(crc, buf, len)
+local unsigned long armv8_crc32_z(crc, buf, len)
     unsigned long crc;
     const unsigned char FAR *buf;
     z_size_t len;
@@ -648,15 +649,7 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     z_size_t last, last2, i;
     z_size_t num;
 
-    /* Return initial CRC, if requested. */
-    if (buf == Z_NULL) return 0;
-
-#ifdef DYNAMIC_CRC_TABLE
-    once(&made, make_crc_table);
-#endif /* DYNAMIC_CRC_TABLE */
-
-    /* Pre-condition the CRC */
-    crc ^= 0xffffffff;
+    /* crc32_z() already handled Z_NULL, the CRC table and pre-conditioning. */
 
     /* Compute the CRC up to a word boundary. */
     while (len && ((z_size_t)buf & 7) != 0) {
@@ -731,7 +724,7 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     return crc ^ 0xffffffff;
 }
 
-#else
+#endif
 
 /* ========================================================================= */
 unsigned long ZEXPORT crc32_z(crc, buf, len)
@@ -740,7 +733,14 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     z_size_t len;
 {
     /* Return initial CRC, if requested. */
-    if (buf == Z_NULL) return 0;
+    if (buf == Z_NULL) {
+        /* Assume user is calling 'crc32(0, NULL, 0)', so we cache CPU features
+         * detection early (and infrequently) on.
+         */
+        if (!len)
+            cpu_check_features();
+        return 0;
+    }
 
 #ifdef DYNAMIC_CRC_TABLE
     once(&made, make_crc_table);
@@ -749,6 +749,12 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     /* Pre-condition the CRC */
     crc ^= 0xffffffff;
 
+#if defined(__aarch64__) && defined(CRC32_ARMV8_CRC32) && W == 8
+    /* If we don't have required CPU features, fallback to portable implementation. */
+    if (arm_cpu_enable_crc32) /* TODO: add x86 optimized CRC32. */
+        return armv8_crc32_z(crc, buf, len);
+#endif
+
 #ifdef W
 
     /* If provided enough bytes, do a braided CRC calculation. */
@@ -1055,8 +1061,6 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     return crc ^ 0xffffffff;
 }
 
-#endif
-
 /* ========================================================================= */
 unsigned long ZEXPORT crc32(crc, buf, len)
     unsigned long crc;
diff --git a/deflate.c b/deflate.c
index 23aef1878..5e7c5018f 100644
--- a/deflate.c
+++ b/deflate.c
@@ -50,6 +50,7 @@
 /* @(#) $Id$ */
 
 #include "deflate.h"
+#include "cpu_features.h"
 
 const char deflate_copyright[] =
    " deflate 1.2.11.1 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
@@ -255,6 +256,14 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
     int wrap = 1;
     static const char my_version[] = ZLIB_VERSION;
 
+    /* Needed to activate optimized insert_string() that helps compression
+     * for all wrapper formats (e.g. RAW, ZLIB, GZIP).
+     * Feature detection is not triggered while using RAW mode (i.e. we never
+     * call crc32() with a NULL buffer). */
+#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
+    cpu_check_features();
+#endif
+
     if (version == Z_NULL || version[0] != my_version[0] ||
         stream_size != sizeof(z_stream)) {
         return Z_VERSION_ERROR;