-
-
Notifications
You must be signed in to change notification settings - Fork 33.8k
gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD] #143262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+134
−23
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
2d2be30
Add base64 benchmark tool and optimize encoding/decoding
gpshead 573eaf3
Fix MSVC build: move table_b2a_base64 before inline functions
gpshead eaac671
NEWS entry
gpshead 060dbf5
Add a whatsnew entry
gpshead 1e12273
expose the benchmark defaults in the help text.
gpshead ef38895
Align base64 tables to 64-byte cache line boundaries
gpshead 7458c99
Use BASE64_PAD macro instead of literal '=' in fast path
gpshead 1f8ff74
Simplify block comment for base64 helpers
gpshead 4b1245b
Remove binasciibench in favor of pyperformance bm_base64 addition PR.
gpshead 4b50c73
reword L1 cache line comment
gpshead f7b27ee
doc Contributed by
gpshead 2b13b81
Remove redundant PAD check in base64 decode fast path
gpshead 30976a9
Use pointer increment in base64_encode_fast
gpshead File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 3 additions & 0 deletions
3
Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and | ||
| related codec has been optimized for modern pipelined CPU architectures and | ||
| now performs 2-3x faster across all platforms. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,11 +76,12 @@ get_binascii_state(PyObject *module) | |
| } | ||
|
|
||
|
|
||
| static const unsigned char table_a2b_base64[] = { | ||
| /* Align to 64 bytes for L1 cache line friendliness */ | ||
| static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = { | ||
| -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, | ||
| -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, | ||
| -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63, | ||
| 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1, 0,-1,-1, /* Note PAD->0 */ | ||
| 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,64,-1,-1, /* PAD->64 detected by fast path */ | ||
| -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, | ||
| 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, | ||
| -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40, | ||
|
|
@@ -101,9 +102,91 @@ static const unsigned char table_a2b_base64[] = { | |
| /* Max binary chunk size; limited only by available memory */ | ||
| #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2) | ||
|
|
||
| static const unsigned char table_b2a_base64[] = | ||
| /* | ||
| * Fast base64 encoding/decoding helpers. | ||
| * | ||
| * Process complete groups without loop-carried dependencies. | ||
| */ | ||
|
|
||
| /* Align to 64 bytes for L1 cache line friendliness */ | ||
| static const unsigned char table_b2a_base64[] Py_ALIGNED(64) = | ||
| "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; | ||
|
|
||
| /* Encode 3 bytes into 4 base64 characters. */ | ||
| static inline void | ||
| base64_encode_trio(const unsigned char *in, unsigned char *out, | ||
| const unsigned char *table) | ||
| { | ||
| unsigned int combined = ((unsigned int)in[0] << 16) | | ||
| ((unsigned int)in[1] << 8) | | ||
| (unsigned int)in[2]; | ||
| out[0] = table[(combined >> 18) & 0x3f]; | ||
| out[1] = table[(combined >> 12) & 0x3f]; | ||
| out[2] = table[(combined >> 6) & 0x3f]; | ||
| out[3] = table[combined & 0x3f]; | ||
| } | ||
|
|
||
| /* Encode multiple complete 3-byte groups. | ||
| * Returns the number of input bytes processed (always a multiple of 3). | ||
| */ | ||
| static inline Py_ssize_t | ||
| base64_encode_fast(const unsigned char *in, Py_ssize_t in_len, | ||
| unsigned char *out, const unsigned char *table) | ||
| { | ||
| Py_ssize_t n_trios = in_len / 3; | ||
| const unsigned char *in_end = in + n_trios * 3; | ||
|
|
||
| while (in < in_end) { | ||
| base64_encode_trio(in, out, table); | ||
| in += 3; | ||
| out += 4; | ||
| } | ||
|
|
||
| return n_trios * 3; | ||
| } | ||
|
|
||
| /* Decode 4 base64 characters into 3 bytes. | ||
| * Returns 1 on success, 0 if any character is invalid. | ||
| */ | ||
| static inline int | ||
| base64_decode_quad(const unsigned char *in, unsigned char *out, | ||
| const unsigned char *table) | ||
| { | ||
| unsigned char v0 = table[in[0]]; | ||
| unsigned char v1 = table[in[1]]; | ||
| unsigned char v2 = table[in[2]]; | ||
| unsigned char v3 = table[in[3]]; | ||
|
|
||
| if ((v0 | v1 | v2 | v3) & 0xc0) { | ||
| return 0; | ||
| } | ||
|
|
||
| out[0] = (v0 << 2) | (v1 >> 4); | ||
| out[1] = (v1 << 4) | (v2 >> 2); | ||
| out[2] = (v2 << 6) | v3; | ||
| return 1; | ||
| } | ||
|
|
||
| /* Decode multiple complete 4-character groups (no padding allowed). | ||
| * Returns the number of input characters processed. | ||
| * Stops at the first invalid character, padding, or incomplete group. | ||
| */ | ||
| static inline Py_ssize_t | ||
| base64_decode_fast(const unsigned char *in, Py_ssize_t in_len, | ||
| unsigned char *out, const unsigned char *table) | ||
| { | ||
| Py_ssize_t n_quads = in_len / 4; | ||
| Py_ssize_t i; | ||
|
|
||
| for (i = 0; i < n_quads; i++) { | ||
| if (!base64_decode_quad(in + i * 4, out + i * 3, table)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did incrementing in and out by 4 and 3 have any benefit? |
||
| break; | ||
| } | ||
| } | ||
|
|
||
| return i * 4; | ||
| } | ||
|
|
||
|
|
||
| static const unsigned short crctab_hqx[256] = { | ||
| 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, | ||
|
|
@@ -403,10 +486,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) | |
| goto error_end; | ||
| } | ||
|
|
||
| size_t i = 0; /* Current position in input */ | ||
|
|
||
| /* Fast path: use optimized decoder for complete quads. | ||
| * This works for both strict and non-strict mode for valid input. | ||
| * The fast path stops at padding, invalid chars, or incomplete groups. | ||
| */ | ||
| if (ascii_len >= 4) { | ||
| Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len, | ||
| bin_data, table_a2b_base64); | ||
| if (fast_chars > 0) { | ||
gpshead marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| i = (size_t)fast_chars; | ||
| bin_data += (fast_chars / 4) * 3; | ||
| } | ||
| } | ||
|
|
||
| /* Slow path: handle remaining input (padding, invalid chars, partial groups) */ | ||
| int quad_pos = 0; | ||
| unsigned char leftchar = 0; | ||
| int pads = 0; | ||
| for (size_t i = 0; i < ascii_len; i++) { | ||
| for (; i < ascii_len; i++) { | ||
| unsigned char this_ch = ascii_data[i]; | ||
|
|
||
| /* Check for pad sequences and ignore | ||
|
|
@@ -533,9 +632,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) | |
| /*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/ | ||
| { | ||
| const unsigned char *bin_data; | ||
| int leftbits = 0; | ||
| unsigned char this_ch; | ||
| unsigned int leftchar = 0; | ||
| Py_ssize_t bin_len; | ||
| binascii_state *state; | ||
|
|
||
|
|
@@ -566,26 +662,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) | |
| } | ||
| unsigned char *ascii_data = PyBytesWriter_GetData(writer); | ||
|
|
||
| for( ; bin_len > 0 ; bin_len--, bin_data++ ) { | ||
| /* Shift the data into our buffer */ | ||
| leftchar = (leftchar << 8) | *bin_data; | ||
| leftbits += 8; | ||
|
|
||
| /* See if there are 6-bit groups ready */ | ||
| while ( leftbits >= 6 ) { | ||
| this_ch = (leftchar >> (leftbits-6)) & 0x3f; | ||
| leftbits -= 6; | ||
| *ascii_data++ = table_b2a_base64[this_ch]; | ||
| } | ||
| } | ||
| if ( leftbits == 2 ) { | ||
| *ascii_data++ = table_b2a_base64[(leftchar&3) << 4]; | ||
| /* Use the optimized fast path for complete 3-byte groups */ | ||
| Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data, | ||
| table_b2a_base64); | ||
| bin_data += fast_bytes; | ||
| ascii_data += (fast_bytes / 3) * 4; | ||
gpshead marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| bin_len -= fast_bytes; | ||
|
|
||
| /* Handle remaining 0-2 bytes */ | ||
| if (bin_len == 1) { | ||
| /* 1 byte remaining: produces 2 base64 chars + 2 padding */ | ||
| unsigned int val = bin_data[0]; | ||
| *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f]; | ||
| *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f]; | ||
| *ascii_data++ = BASE64_PAD; | ||
| *ascii_data++ = BASE64_PAD; | ||
| } else if ( leftbits == 4 ) { | ||
| *ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2]; | ||
| } | ||
| else if (bin_len == 2) { | ||
| /* 2 bytes remaining: produces 3 base64 chars + 1 padding */ | ||
| unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1]; | ||
| *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f]; | ||
| *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f]; | ||
| *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f]; | ||
| *ascii_data++ = BASE64_PAD; | ||
| } | ||
|
|
||
| if (newline) | ||
| *ascii_data++ = '\n'; /* Append a courtesy newline */ | ||
|
|
||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.