Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.4.4)
cmake_minimum_required(VERSION 2.6)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CMake 2.6 is the minimum version that lets us use the 'Threads' find module and later add it to the libraries linked with zlib.

set(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS ON)

project(zlib C)
Expand Down Expand Up @@ -96,6 +96,7 @@ set(ZLIB_PUBLIC_HDRS
)
set(ZLIB_PRIVATE_HDRS
crc32.h
cpu_features.h
deflate.h
gzguts.h
inffast.h
Expand All @@ -108,6 +109,7 @@ set(ZLIB_PRIVATE_HDRS
set(ZLIB_SRCS
adler32.c
compress.c
cpu_features.c
crc32.c
deflate.c
gzclose.c
Expand Down Expand Up @@ -136,6 +138,12 @@ if(CMAKE_COMPILER_IS_GNUCC)
set(ZLIB_ASMS contrib/amd64/amd64-match.S)
endif ()

# NEON is mandatory in ARMv8, but 'crypto extensions' are optional.
# Compile every file in ZLIB_SRCS with the CRC extension enabled and define
# CRC32_ARMV8_CRC32 so crc32.c builds its ARMv8 CRC32 code path.
# The variable is referenced by name (not as ${...}) so that an empty or
# unset CMAKE_SYSTEM_PROCESSOR cannot collapse the condition into
# 'if( MATCHES ...)', which CMake rejects as a syntax error.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    set_source_files_properties(${ZLIB_SRCS} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a+crc)
    add_definitions(-DCRC32_ARMV8_CRC32)
endif()

if(ZLIB_ASMS)
add_definitions(-DASMV)
set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
Expand Down Expand Up @@ -200,6 +208,19 @@ if(NOT CYGWIN)
endif()

if(UNIX)
# CPU features detection in a multithreaded context (e.g. Chromium) can
# be racy and requires proper handling.
find_package (Threads REQUIRED)
target_link_libraries(zlib ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(zlibstatic ${CMAKE_THREAD_LIBS_INIT})

# This is limited, basically we assume either Arm or Intel.
# These macros select the detection code path in cpu_features.c.
# CMAKE_SYSTEM_PROCESSOR is referenced by name (not as ${...}) so that an
# empty or unset value does not turn the condition into a syntax error.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    add_definitions(-DARMV8_OS_LINUX)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
    add_definitions(-DX86_NOT_WINDOWS)
endif()

# On unix-like platforms the library is almost always called libz
set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z)
if(NOT APPLE)
Expand Down
145 changes: 145 additions & 0 deletions cpu_features.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
/* cpu_features.c -- Processor features detection.
*
* Copyright 2018 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@madler I'm looking for your feedback concerning the licensing of the new code.

Chromium's license can be found at: https://cs.chromium.org/chromium/src/LICENSE

I believe we may be able to adjust it to follow the zlib license if that is your preference.

* found in the Chromium source repository LICENSE file.
*/

#include "cpu_features.h"
#include "zutil.h"

#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(X86_NOT_WINDOWS) || defined(X86_WINDOWS)
#include <cpuid.h>
#endif

/* TODO(cavalcantii): remove checks for x86_flags on deflate.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I imported the code pretty much as it is from:
https://cs.chromium.org/chromium/src/third_party/zlib/cpu_features.c

To clarify: this TODO is not applicable within the context of this patch.

*/
/* Run-time CPU capability flags. Each starts at 0 and, within this file, is
 * only written by _cpu_check_features(). */

/* Nonzero when the platform reports the ARM CRC32 feature. */
int ZLIB_INTERNAL arm_cpu_enable_crc32 = 0;
/* Nonzero when the platform reports the ARM PMULL (crypto) feature. */
int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
/* Nonzero when CPUID reports SSSE3. */
int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
/* Nonzero when CPUID reports SSE2, SSE4.2 and PCLMULQDQ together. */
int ZLIB_INTERNAL x86_cpu_enable_simd = 0;

#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA)
#include <pthread.h>
#endif

/* Platform headers required by the feature-detection code in this file.
 * The branch is chosen by a platform macro supplied by the build (for
 * example, CMakeLists.txt defines ARMV8_OS_LINUX or X86_NOT_WINDOWS). */
#if defined(ARMV8_OS_ANDROID)
/* android_getCpuFeatures() and the ANDROID_CPU_* feature bits. */
#include <cpu-features.h>
#elif defined(ARMV8_OS_LINUX)
/* HWCAP_* / HWCAP2_* bits and getauxval(). */
#include <asm/hwcap.h>
#include <sys/auxv.h>
#elif defined(ARMV8_OS_FUCHSIA)
/* zx_system_get_features() and the ZX_ARM64_FEATURE_* bits. */
#include <zircon/features.h>
#include <zircon/syscalls.h>
#include <zircon/types.h>
#elif defined(ARMV8_OS_WINDOWS) || defined(X86_WINDOWS)
/* InitOnceExecuteOnce() and IsProcessorFeaturePresent(). */
#include <windows.h>
#elif !defined(_MSC_VER)
/* Every other non-MSVC build uses pthread_once() for one-time detection. */
#include <pthread.h>
#else
#error cpu_features.c CPU feature detection is not defined for your platform
#endif

#if !defined(CPU_NO_SIMD) && !defined(ARM_OS_IOS)
static void _cpu_check_features(void);
#endif

#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An explanation here: we have to support many different combinations in Chromium (e.g. Android without SIMD, with SIMD, 32bit, 64bit, etc) and operating systems e.g. Fuchsia, WoA (Windows on ARM), etc.

Unfortunately, there isn't much else I can do to make the code simpler.
:-(

static pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
void ZLIB_INTERNAL cpu_check_features(void)
{
pthread_once(&cpu_check_inited_once, _cpu_check_features);
Copy link
Author

@Adenilson Adenilson Jan 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking that we could easily change this to make it optional to have threaded synchronization for CPU features detection.

That would allow users to opt in to the feature (e.g. if they plan to run the code in a multithreaded app), while letting zlib keep its set of required dependencies as minimal as it is today.

The implementation would be something like:
#if defined(HAVE_THREAD_SUPPORT)
// here goes code using POSIX thread API
#else
// Simply call _cpu_check_features().
#endif

@madler what do you think?

}
#elif defined(ARMV8_OS_WINDOWS) || defined(X86_WINDOWS)
/* One-time initialization state guarding CPU feature detection on Windows. */
static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT;

/* Adapter with the callback signature expected by InitOnceExecuteOnce().
 * None of the arguments are used; it only runs the detection routine and
 * reports success. */
static BOOL CALLBACK _cpu_check_features_forwarder(PINIT_ONCE init_once, PVOID parameter, PVOID* ctx)
{
    (void)init_once;
    (void)parameter;
    (void)ctx;

    _cpu_check_features();
    return TRUE;
}

/* Run CPU feature detection through the Windows one-time initialization
 * API, so the detection body is executed via cpu_check_inited_once. */
void ZLIB_INTERNAL cpu_check_features(void)
{
    InitOnceExecuteOnce(&cpu_check_inited_once,
                        _cpu_check_features_forwarder,
                        NULL,   /* no parameter passed to the callback */
                        NULL);  /* no context requested back */
}
#endif

#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
/*
* iOS@ARM is a special case where we always have NEON but don't check
* for crypto extensions.
*/
#ifndef ARM_OS_IOS
/*
* See http://bit.ly/2CcoEsr for run-time detection of ARM features and also
* crbug.com/931275 for android_getCpuFeatures() use in the Android sandbox.
*/
static void _cpu_check_features(void)
{
#if defined(ARMV8_OS_ANDROID) && defined(__aarch64__)
uint64_t features = android_getCpuFeatures();
arm_cpu_enable_crc32 = !!(features & ANDROID_CPU_ARM64_FEATURE_CRC32);
arm_cpu_enable_pmull = !!(features & ANDROID_CPU_ARM64_FEATURE_PMULL);
#elif defined(ARMV8_OS_ANDROID) /* aarch32 */
uint64_t features = android_getCpuFeatures();
arm_cpu_enable_crc32 = !!(features & ANDROID_CPU_ARM_FEATURE_CRC32);
arm_cpu_enable_pmull = !!(features & ANDROID_CPU_ARM_FEATURE_PMULL);
#elif defined(ARMV8_OS_LINUX) && defined(__aarch64__)
unsigned long features = getauxval(AT_HWCAP);
arm_cpu_enable_crc32 = !!(features & HWCAP_CRC32);
arm_cpu_enable_pmull = !!(features & HWCAP_PMULL);
#elif defined(ARMV8_OS_LINUX) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
/* Query HWCAP2 for ARMV8-A SoCs running in aarch32 mode */
unsigned long features = getauxval(AT_HWCAP2);
arm_cpu_enable_crc32 = !!(features & HWCAP2_CRC32);
arm_cpu_enable_pmull = !!(features & HWCAP2_PMULL);
#elif defined(ARMV8_OS_FUCHSIA)
uint32_t features;
zx_status_t rc = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
if (rc != ZX_OK || (features & ZX_ARM64_FEATURE_ISA_ASIMD) == 0)
return; /* Report nothing if ASIMD(NEON) is missing */
arm_cpu_enable_crc32 = !!(features & ZX_ARM64_FEATURE_ISA_CRC32);
arm_cpu_enable_pmull = !!(features & ZX_ARM64_FEATURE_ISA_PMULL);
#elif defined(ARMV8_OS_WINDOWS)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for WoA (Windows on ARM).

arm_cpu_enable_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
arm_cpu_enable_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
#endif
}
#endif
#elif defined(X86_NOT_WINDOWS) || defined(X86_WINDOWS)
/*
* iOS@x86 (i.e. emulator) is another special case where we disable
* SIMD optimizations.
*/
#ifndef CPU_NO_SIMD
/* On x86 we simply use an instruction (i.e. CPUID) to check the CPU
 * features.
 */
static void _cpu_check_features(void)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though we don't use x86 CPU features flags for now, they will be needed for the next patch (i.e. optimized insert_string).

{
int x86_cpu_has_sse2;
int x86_cpu_has_ssse3;
int x86_cpu_has_sse42;
int x86_cpu_has_pclmulqdq;
int abcd[4];
#ifdef _MSC_VER
__cpuid(abcd, 1);
#else
__cpuid(1, abcd[0], abcd[1], abcd[2], abcd[3]);
#endif
x86_cpu_has_sse2 = abcd[3] & 0x4000000;
x86_cpu_has_ssse3 = abcd[2] & 0x000200;
x86_cpu_has_sse42 = abcd[2] & 0x100000;
x86_cpu_has_pclmulqdq = abcd[2] & 0x2;

x86_cpu_enable_ssse3 = x86_cpu_has_ssse3;

x86_cpu_enable_simd = x86_cpu_has_sse2 &&
x86_cpu_has_sse42 &&
x86_cpu_has_pclmulqdq;
}
#endif
#endif
17 changes: 17 additions & 0 deletions cpu_features.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/* cpu_features.h -- Processor features detection.
*
* Copyright 2018 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/

#include "zlib.h"

/* TODO(cavalcantii): remove checks for x86_flags on deflate.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See previous note about importing the code.

*/
/* Run-time CPU capability flags, defined in cpu_features.c and initialized
 * to 0. They are filled in by the detection routine that
 * cpu_check_features() triggers. */
extern int arm_cpu_enable_crc32;
extern int arm_cpu_enable_pmull;
extern int x86_cpu_enable_ssse3;
extern int x86_cpu_enable_simd;

/* Trigger CPU feature detection. The implementations in cpu_features.c route
 * the call through pthread_once() or InitOnceExecuteOnce(). */
void cpu_check_features(void);
36 changes: 20 additions & 16 deletions crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
# endif /* !DYNAMIC_CRC_TABLE */
#endif /* MAKECRCH */

#include "cpu_features.h"
#include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */

/*
Expand Down Expand Up @@ -620,13 +621,13 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
/* =========================================================================
* Use ARM machine instructions if available. This will compute the CRC about
* ten times faster than the braided calculation. This code does not check for
* the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
* the presence of the CRC instruction at run time. CRC32_ARMV8_CRC32 will
* only be defined if the compilation specifies an ARM processor architecture
* that has the instructions. For example, compiling with -march=armv8.1-a or
* -march=armv8-a+crc, or -march=native if the compile machine has the crc32
* instructions.
*/
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
#if defined(__aarch64__) && defined(CRC32_ARMV8_CRC32) && W == 8

/*
Constants empirically determined to maximize speed. These values are from
Expand All @@ -636,7 +637,7 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
#define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
#define Z_BATCH_MIN 800 /* fewest words in a final batch */

unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long ZEXPORT armv8_crc32_z(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
Expand All @@ -648,15 +649,7 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
z_size_t last, last2, i;
z_size_t num;

/* Return initial CRC, if requested. */
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hooked into the portable code to save some code duplication.

As a result, for ARMv8 we will build both implementations (e.g. portable + optimized) and decide which one to use at runtime.

We also cache the CPU features detection flags to avoid wasting CPU cycles.

if (buf == Z_NULL) return 0;

#ifdef DYNAMIC_CRC_TABLE
once(&made, make_crc_table);
#endif /* DYNAMIC_CRC_TABLE */

/* Pre-condition the CRC */
crc ^= 0xffffffff;
/* Initial setup is done in crc32_z() i.e. handling Z_NULL, etc. */

/* Compute the CRC up to a word boundary. */
while (len && ((z_size_t)buf & 7) != 0) {
Expand Down Expand Up @@ -731,7 +724,7 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
return crc ^ 0xffffffff;
}

#else
#endif

/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
Expand All @@ -740,7 +733,14 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
z_size_t len;
{
/* Return initial CRC, if requested. */
if (buf == Z_NULL) return 0;
if (buf == Z_NULL) {
/* Assume the user is calling 'crc32(0, NULL, 0)' to obtain the initial
* CRC value; use that call to run CPU features detection early (and
* infrequently) and cache the result.
*/
if (!len)
cpu_check_features();
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An explanation here: in Chromium code base, we have client code (e.g. network code) that may rely on using zlib's crc32.

There is no guarantee that the client code is performing compression/decompression of data, so we have to cover that corner case by ensuring that we perform CPU features detection in a scenario where the user relies only on calling crc32() (e.g. first to get an initial valid crc32 value and next with a real data vector).

return 0;
}

#ifdef DYNAMIC_CRC_TABLE
once(&made, make_crc_table);
Expand All @@ -749,6 +749,12 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
/* Pre-condition the CRC */
crc ^= 0xffffffff;

#if defined(CRC32_ARMV8_CRC32)
/* If we don't have required CPU features, fallback to portable implementation. */
if (arm_cpu_enable_crc32) /* TODO: add x86 optimized CRC32. */
return armv8_crc32_z(crc, buf, len);
#endif

#ifdef W

/* If provided enough bytes, do a braided CRC calculation. */
Expand Down Expand Up @@ -1055,8 +1061,6 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
return crc ^ 0xffffffff;
}

#endif

/* ========================================================================= */
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;
Expand Down
9 changes: 9 additions & 0 deletions deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
/* @(#) $Id$ */

#include "deflate.h"
#include "cpu_features.h"

const char deflate_copyright[] =
" deflate 1.2.11.1 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
Expand Down Expand Up @@ -255,6 +256,14 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
int wrap = 1;
static const char my_version[] = ZLIB_VERSION;

/* Needed to activate the optimized insert_string() that helps compression
 * for all wrapper formats (e.g. RAW, ZLIB, GZIP).
 * In RAW mode crc32() is never called with a NULL buffer, so feature
 * detection would not be triggered unless it is done here.
 */
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
cpu_check_features();
Copy link
Author

@Adenilson Adenilson Jan 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is another valid entry point to perform CPU features detection, covering the scenario where we perform compression using RAW mode.

I found this corner case last December while V8 was starting to use zlib's checksums for speeding up loading snapshots (e.g. code blobs).

I think we should aim to have an optimized CRC32 for Intel in the near future, since we already have this done in Chromium; for reference:
https://cs.chromium.org/chromium/src/third_party/zlib/crc32_simd.c

#endif

if (version == Z_NULL || version[0] != my_version[0] ||
stream_size != sizeof(z_stream)) {
return Z_VERSION_ERROR;
Expand Down