-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Adding CPU features detection code #468
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| /* cpu_features.c -- Processor features detection. | ||
| * | ||
| * Copyright 2018 The Chromium Authors. All rights reserved. | ||
| * Use of this source code is governed by a BSD-style license that can be | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @madler I'm looking for your feedback concerning the licensing of the new code. Chromium's license can be found at: https://cs.chromium.org/chromium/src/LICENSE I believe we may be able to adjust it to follow the zlib license if that is your preference. |
||
| * found in the Chromium source repository LICENSE file. | ||
| */ | ||
|
|
||
| #include "cpu_features.h" | ||
| #include "zutil.h" | ||
|
|
||
| #include <stdint.h> | ||
| #if defined(_MSC_VER) | ||
| #include <intrin.h> | ||
| #elif defined(X86_NOT_WINDOWS) || defined(X86_WINDOWS) | ||
| #include <cpuid.h> | ||
| #endif | ||
|
|
||
| /* TODO(cavalcantii): remove checks for x86_flags on deflate. | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I imported the code pretty much as it is from: To clarify: this TODO is not applicable within the context of this patch. |
||
| */ | ||
| int ZLIB_INTERNAL arm_cpu_enable_crc32 = 0; | ||
| int ZLIB_INTERNAL arm_cpu_enable_pmull = 0; | ||
| int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0; | ||
| int ZLIB_INTERNAL x86_cpu_enable_simd = 0; | ||
|
|
||
| #if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) | ||
| #include <pthread.h> | ||
| #endif | ||
|
|
||
| #if defined(ARMV8_OS_ANDROID) | ||
| #include <cpu-features.h> | ||
| #elif defined(ARMV8_OS_LINUX) | ||
| #include <asm/hwcap.h> | ||
| #include <sys/auxv.h> | ||
| #elif defined(ARMV8_OS_FUCHSIA) | ||
| #include <zircon/features.h> | ||
| #include <zircon/syscalls.h> | ||
| #include <zircon/types.h> | ||
| #elif defined(ARMV8_OS_WINDOWS) || defined(X86_WINDOWS) | ||
| #include <windows.h> | ||
| #elif !defined(_MSC_VER) | ||
| #include <pthread.h> | ||
| #else | ||
| #error cpu_features.c CPU feature detection in not defined for your platform | ||
| #endif | ||
|
|
||
| #if !defined(CPU_NO_SIMD) && !defined(ARM_OS_IOS) | ||
| static void _cpu_check_features(void); | ||
| #endif | ||
|
|
||
| #if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS) | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. An explanation here: we have to support many different build combinations in Chromium (e.g. Android without SIMD, with SIMD, 32-bit, 64-bit, etc.) and operating systems (e.g. Fuchsia, WoA (Windows on ARM), etc.). Unfortunately, there isn't much else I can do to make the code simpler. |
||
| static pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT; | ||
| void ZLIB_INTERNAL cpu_check_features(void) | ||
| { | ||
| pthread_once(&cpu_check_inited_once, _cpu_check_features); | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was thinking that we could easily change this to make threaded synchronization for CPU features detection optional. That would allow users to opt in to the feature (i.e. if they plan to run the code in a multithreaded app), allowing zlib to keep its required set of dependencies as minimal as it is today. The implementation would be something like: @madler what do you think? |
||
| } | ||
| #elif defined(ARMV8_OS_WINDOWS) || defined(X86_WINDOWS) | ||
| static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT; | ||
| static BOOL CALLBACK _cpu_check_features_forwarder(PINIT_ONCE once, PVOID param, PVOID* context) | ||
| { | ||
| _cpu_check_features(); | ||
| return TRUE; | ||
| } | ||
| void ZLIB_INTERNAL cpu_check_features(void) | ||
| { | ||
| InitOnceExecuteOnce(&cpu_check_inited_once, _cpu_check_features_forwarder, | ||
| NULL, NULL); | ||
| } | ||
| #endif | ||
|
|
||
| #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) | ||
| /* | ||
| * iOS@ARM is a special case where we always have NEON but don't check | ||
| * for crypto extensions. | ||
| */ | ||
| #ifndef ARM_OS_IOS | ||
| /* | ||
| * See http://bit.ly/2CcoEsr for run-time detection of ARM features and also | ||
| * crbug.com/931275 for android_getCpuFeatures() use in the Android sandbox. | ||
| */ | ||
| static void _cpu_check_features(void) | ||
| { | ||
| #if defined(ARMV8_OS_ANDROID) && defined(__aarch64__) | ||
| uint64_t features = android_getCpuFeatures(); | ||
| arm_cpu_enable_crc32 = !!(features & ANDROID_CPU_ARM64_FEATURE_CRC32); | ||
| arm_cpu_enable_pmull = !!(features & ANDROID_CPU_ARM64_FEATURE_PMULL); | ||
| #elif defined(ARMV8_OS_ANDROID) /* aarch32 */ | ||
| uint64_t features = android_getCpuFeatures(); | ||
| arm_cpu_enable_crc32 = !!(features & ANDROID_CPU_ARM_FEATURE_CRC32); | ||
| arm_cpu_enable_pmull = !!(features & ANDROID_CPU_ARM_FEATURE_PMULL); | ||
| #elif defined(ARMV8_OS_LINUX) && defined(__aarch64__) | ||
| unsigned long features = getauxval(AT_HWCAP); | ||
| arm_cpu_enable_crc32 = !!(features & HWCAP_CRC32); | ||
| arm_cpu_enable_pmull = !!(features & HWCAP_PMULL); | ||
| #elif defined(ARMV8_OS_LINUX) && (defined(__ARM_NEON) || defined(__ARM_NEON__)) | ||
| /* Query HWCAP2 for ARMV8-A SoCs running in aarch32 mode */ | ||
| unsigned long features = getauxval(AT_HWCAP2); | ||
| arm_cpu_enable_crc32 = !!(features & HWCAP2_CRC32); | ||
| arm_cpu_enable_pmull = !!(features & HWCAP2_PMULL); | ||
| #elif defined(ARMV8_OS_FUCHSIA) | ||
| uint32_t features; | ||
| zx_status_t rc = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); | ||
| if (rc != ZX_OK || (features & ZX_ARM64_FEATURE_ISA_ASIMD) == 0) | ||
| return; /* Report nothing if ASIMD(NEON) is missing */ | ||
| arm_cpu_enable_crc32 = !!(features & ZX_ARM64_FEATURE_ISA_CRC32); | ||
| arm_cpu_enable_pmull = !!(features & ZX_ARM64_FEATURE_ISA_PMULL); | ||
| #elif defined(ARMV8_OS_WINDOWS) | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is for WoA (Windows on ARM). |
||
| arm_cpu_enable_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); | ||
| arm_cpu_enable_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); | ||
| #endif | ||
| } | ||
| #endif | ||
| #elif defined(X86_NOT_WINDOWS) || defined(X86_WINDOWS) | ||
| /* | ||
| * iOS@x86 (i.e. emulator) is another special case where we disable | ||
| * SIMD optimizations. | ||
| */ | ||
| #ifndef CPU_NO_SIMD | ||
| /* On x86 we simply use an instruction to check the CPU features. | ||
| * (i.e. CPUID). | ||
| */ | ||
| static void _cpu_check_features(void) | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Even though we don't use the x86 CPU feature flags for now, they will be needed for the next patch (i.e. optimized insert_string). |
||
| { | ||
| int x86_cpu_has_sse2; | ||
| int x86_cpu_has_ssse3; | ||
| int x86_cpu_has_sse42; | ||
| int x86_cpu_has_pclmulqdq; | ||
| int abcd[4]; | ||
| #ifdef _MSC_VER | ||
| __cpuid(abcd, 1); | ||
| #else | ||
| __cpuid(1, abcd[0], abcd[1], abcd[2], abcd[3]); | ||
| #endif | ||
| x86_cpu_has_sse2 = abcd[3] & 0x4000000; | ||
| x86_cpu_has_ssse3 = abcd[2] & 0x000200; | ||
| x86_cpu_has_sse42 = abcd[2] & 0x100000; | ||
| x86_cpu_has_pclmulqdq = abcd[2] & 0x2; | ||
|
|
||
| x86_cpu_enable_ssse3 = x86_cpu_has_ssse3; | ||
|
|
||
| x86_cpu_enable_simd = x86_cpu_has_sse2 && | ||
| x86_cpu_has_sse42 && | ||
| x86_cpu_has_pclmulqdq; | ||
| } | ||
| #endif | ||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| /* cpu_features.h -- Processor features detection. | ||
| * | ||
| * Copyright 2018 The Chromium Authors. All rights reserved. | ||
| * Use of this source code is governed by a BSD-style license that can be | ||
| * found in the Chromium source repository LICENSE file. | ||
| */ | ||
|
|
||
| #include "zlib.h" | ||
|
|
||
| /* TODO(cavalcantii): remove checks for x86_flags on deflate. | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See previous note about importing the code. |
||
| */ | ||
| extern int arm_cpu_enable_crc32; | ||
| extern int arm_cpu_enable_pmull; | ||
| extern int x86_cpu_enable_ssse3; | ||
| extern int x86_cpu_enable_simd; | ||
|
|
||
| void cpu_check_features(void); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ | |
| # endif /* !DYNAMIC_CRC_TABLE */ | ||
| #endif /* MAKECRCH */ | ||
|
|
||
| #include "cpu_features.h" | ||
| #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */ | ||
|
|
||
| /* | ||
|
|
@@ -620,13 +621,13 @@ const z_crc_t FAR * ZEXPORT get_crc_table() | |
| /* ========================================================================= | ||
| * Use ARM machine instructions if available. This will compute the CRC about | ||
| * ten times faster than the braided calculation. This code does not check for | ||
| * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will | ||
| * the presence of the CRC instruction at run time. CRC32_ARMV8_CRC32 will | ||
| * only be defined if the compilation specifies an ARM processor architecture | ||
| * that has the instructions. For example, compiling with -march=armv8.1-a or | ||
| * -march=armv8-a+crc, or -march=native if the compile machine has the crc32 | ||
| * instructions. | ||
| */ | ||
| #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 | ||
| #if defined(__aarch64__) && defined(CRC32_ARMV8_CRC32) && W == 8 | ||
|
|
||
| /* | ||
| Constants empirically determined to maximize speed. These values are from | ||
|
|
@@ -636,7 +637,7 @@ const z_crc_t FAR * ZEXPORT get_crc_table() | |
| #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */ | ||
| #define Z_BATCH_MIN 800 /* fewest words in a final batch */ | ||
|
|
||
| unsigned long ZEXPORT crc32_z(crc, buf, len) | ||
| unsigned long ZEXPORT armv8_crc32_z(crc, buf, len) | ||
| unsigned long crc; | ||
| const unsigned char FAR *buf; | ||
| z_size_t len; | ||
|
|
@@ -648,15 +649,7 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) | |
| z_size_t last, last2, i; | ||
| z_size_t num; | ||
|
|
||
| /* Return initial CRC, if requested. */ | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I hooked into the portable code to avoid some code duplication. As a result, for ARMv8 we will build both implementations (i.e. portable + optimized) and decide which one to use at runtime. We also cache the CPU feature detection flags to avoid wasting CPU cycles. |
||
| if (buf == Z_NULL) return 0; | ||
|
|
||
| #ifdef DYNAMIC_CRC_TABLE | ||
| once(&made, make_crc_table); | ||
| #endif /* DYNAMIC_CRC_TABLE */ | ||
|
|
||
| /* Pre-condition the CRC */ | ||
| crc ^= 0xffffffff; | ||
| /* Initial setup is done in crc32_z() i.e. handling Z_NULL, etc. */ | ||
|
|
||
| /* Compute the CRC up to a word boundary. */ | ||
| while (len && ((z_size_t)buf & 7) != 0) { | ||
|
|
@@ -731,7 +724,7 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) | |
| return crc ^ 0xffffffff; | ||
| } | ||
|
|
||
| #else | ||
| #endif | ||
|
|
||
| /* ========================================================================= */ | ||
| unsigned long ZEXPORT crc32_z(crc, buf, len) | ||
|
|
@@ -740,7 +733,14 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) | |
| z_size_t len; | ||
| { | ||
| /* Return initial CRC, if requested. */ | ||
| if (buf == Z_NULL) return 0; | ||
| if (buf == Z_NULL) { | ||
| /* Assume user is calling 'crc32(0, NULL, 0)', so we cache CPU features | ||
| * detection early (and infrequently) on. | ||
| */ | ||
| if (!len) | ||
| cpu_check_features(); | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. An explanation here: in the Chromium code base, we have client code (e.g. network code) that may rely on using zlib's crc32. There is no guarantee that the client code is performing compression/decompression of data, so we have to cover that corner case by ensuring that we perform CPU features detection in a scenario where the user only relies on calling crc32() (e.g. first to get an initial valid crc32 value and next with a real data vector). |
||
| return 0; | ||
| } | ||
|
|
||
| #ifdef DYNAMIC_CRC_TABLE | ||
| once(&made, make_crc_table); | ||
|
|
@@ -749,6 +749,12 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) | |
| /* Pre-condition the CRC */ | ||
| crc ^= 0xffffffff; | ||
|
|
||
| #if defined(CRC32_ARMV8_CRC32) | ||
| /* If we don't have required CPU features, fallback to portable implementation. */ | ||
| if (arm_cpu_enable_crc32) /* TODO: add x86 optimized CRC32. */ | ||
| return armv8_crc32_z(crc, buf, len); | ||
| #endif | ||
|
|
||
| #ifdef W | ||
|
|
||
| /* If provided enough bytes, do a braided CRC calculation. */ | ||
|
|
@@ -1055,8 +1061,6 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) | |
| return crc ^ 0xffffffff; | ||
| } | ||
|
|
||
| #endif | ||
|
|
||
| /* ========================================================================= */ | ||
| unsigned long ZEXPORT crc32(crc, buf, len) | ||
| unsigned long crc; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,6 +50,7 @@ | |
| /* @(#) $Id$ */ | ||
|
|
||
| #include "deflate.h" | ||
| #include "cpu_features.h" | ||
|
|
||
| const char deflate_copyright[] = | ||
| " deflate 1.2.11.1 Copyright 1995-2017 Jean-loup Gailly and Mark Adler "; | ||
|
|
@@ -255,6 +256,14 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, | |
| int wrap = 1; | ||
| static const char my_version[] = ZLIB_VERSION; | ||
|
|
||
| // Needed to activate optimized insert_string() that helps compression | ||
| // for all wrapper formats (e.g. RAW, ZLIB, GZIP). | ||
| // Feature detection is not triggered while using RAW mode (i.e. we never | ||
| // call crc32() with a NULL buffer). | ||
| #if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL) | ||
| cpu_check_features(); | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is another valid entry point to perform CPU features detection, covering the scenario where we perform compression using RAW mode. I found this corner case last December while V8 was starting to use zlib's checksums for speeding up the loading of snapshots (e.g. code blobs). I think we should aim to have, in the near future, an optimized CRC32 for Intel, as we have this done already; for reference: |
||
| #endif | ||
|
|
||
| if (version == Z_NULL || version[0] != my_version[0] || | ||
| stream_size != sizeof(z_stream)) { | ||
| return Z_VERSION_ERROR; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CMake 2.6 is the minimum version needed to use the 'Threads' macro and, later on, to add it to the libraries linked to zlib.