diff --git a/lib/astc-encoder/.gitrepo b/lib/astc-encoder/.gitrepo
index 568c3dd6bc..a14134bab2 100644
--- a/lib/astc-encoder/.gitrepo
+++ b/lib/astc-encoder/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = https://github.com/ARM-software/astc-encoder.git
 	branch = main
-	commit = 42a8f6ee01715f45edffb6773e34b8bb914a47df
-	parent = 2b1c072a13e9b69ada8457665b51ed5b59ddb408
+	commit = a33dbb44739da188e32d8f90e2342a780cf751fe
+	parent = dbfeb82a731c534f0ad830800a9cd7e68755cc0d
 	method = merge
 	cmdver = 0.4.3
diff --git a/lib/astc-encoder/CMakeLists.txt b/lib/astc-encoder/CMakeLists.txt
index 761098054f..0ff7f1ce9e 100644
--- a/lib/astc-encoder/CMakeLists.txt
+++ b/lib/astc-encoder/CMakeLists.txt
@@ -1,6 +1,6 @@
 #  SPDX-License-Identifier: Apache-2.0
 #  ----------------------------------------------------------------------------
-#  Copyright 2020-2021 Arm Limited
+#  Copyright 2020-2022 Arm Limited
 #
 #  Licensed under the Apache License, Version 2.0 (the "License"); you may not
 #  use this file except in compliance with the License. You may obtain a copy
@@ -183,12 +183,15 @@ endif()
 
 function(printopt optName optVal)
     if(${optVal})
-        message(STATUS "  ${optName} -  ON")
+        message(STATUS "  ${optName}  - ON")
     else()
-        message(STATUS "  ${optName} - OFF")
+        message(STATUS "  ${optName}  - OFF")
     endif()
 endfunction()
 
+if("${BLOCK_MAX_TEXELS}")
+     message(STATUS "  Max block texels - ${BLOCK_MAX_TEXELS}")
+endif()
 printopt("AVX2 backend   " ${ISA_AVX2})
 printopt("SSE4.1 backend " ${ISA_SSE41})
 printopt("SSE2 backend   " ${ISA_SSE2})
diff --git a/lib/astc-encoder/Docs/Building.md b/lib/astc-encoder/Docs/Building.md
index 83d2f30196..adb98a377a 100644
--- a/lib/astc-encoder/Docs/Building.md
+++ b/lib/astc-encoder/Docs/Building.md
@@ -118,17 +118,6 @@ make install -j16
 
 For codec developers there are a number of useful features in the build system.
 
-### No intrinsics build
-
-All normal builds will use SIMD accelerated code paths using intrinsics, as all
-target architectures (x86-64 and aarch64) guarantee SIMD availability. For
-development purposes it is possible to build an intrinsic-free build which uses
-no explicit SIMD acceleration (the compiler may still auto-vectorize).
-
-To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
-when configuring. It is NOT recommended to use this for production; it is
-significantly slower than the vectorized SIMD builds.
-
 ### Build Types
 
 We support and test the following `CMAKE_BUILD_TYPE` options.
@@ -142,6 +131,26 @@ We support and test the following `CMAKE_BUILD_TYPE` options.
 Note that optimized release builds are compiled with link-time optimization,
 which can make profiling more challenging ...
 
+### No intrinsics builds
+
+All normal builds will use SIMD accelerated code paths using intrinsics, as all
+target architectures (x86-64 and aarch64) guarantee SIMD availability. For
+development purposes it is possible to build an intrinsic-free build which uses
+no explicit SIMD acceleration (the compiler may still auto-vectorize).
+
+To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
+when configuring. It is NOT recommended to use this for production; it is
+significantly slower than the vectorized SIMD builds.
+
+### Constrained block size builds
+
+All normal builds will support all ASTC block sizes, including the worst case
+6x6x6 3D block size (216 texels per block). Compressor memory footprint and
+performance can be improved by limiting the block sizes supported in the build
+by adding `-DBLOCK_MAX_TEXELS=<texel_count>` to the CMake command line when
+configuring. Legal block sizes that are unavailable in a restricted build will
+return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
+
 ### Testing
 
 We support building unit tests.
@@ -186,3 +195,7 @@ integrated into other projects using CMake. An example of the CMake integration
 and the codec API usage can be found in the `./Utils/Example` directory in the
 repository. See the [Example Readme](../Utils/Example/README.md) for more
 details.
+
+- - -
+
+_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/ChangeLog-2x.md b/lib/astc-encoder/Docs/ChangeLog-2x.md
index ca0d0d0aa0..0c1ccdce4e 100644
--- a/lib/astc-encoder/Docs/ChangeLog-2x.md
+++ b/lib/astc-encoder/Docs/ChangeLog-2x.md
@@ -322,3 +322,7 @@ Key for performance charts
 **Relative performance vs 1.7 release:**
 
 ![Relative scores 2.0 vs 1.7](./ChangeLogImg/relative-1.7-to-2.0.png)
+
+- - -
+
+_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/ChangeLog-3x.md b/lib/astc-encoder/Docs/ChangeLog-3x.md
index 7d68347fb4..f227521145 100644
--- a/lib/astc-encoder/Docs/ChangeLog-3x.md
+++ b/lib/astc-encoder/Docs/ChangeLog-3x.md
@@ -16,6 +16,13 @@ The 3.4 release introduces another round of optimizations, removing a number
 of power-user configuration options to simplify the core compressor data path.
 
 * **General:**
+  * **Feature:** Many memory allocations have been moved off the stack into
+    dynamically allocated working memory. This significantly reduces the peak
+    stack usage, allowing the compressor to run in systems with 128KB stack
+    limits.
+  * **Feature:** Builds now support `-DBLOCK_MAX_TEXELS=<count>` to allow a
+    compressor to support a subset of block sizes. This can reduce binary size
+    and runtime memory footprint, and improve performance.
   * **Feature:** The `-v` and `-va` options to set a per-texel error weight
     function are no longer supported.
   * **Feature:** The `-b` option to set a per-texel error weight boost for
@@ -186,3 +193,7 @@ Key for charts:
 **Relative performance vs 2.5 release:**
 
 ![Relative scores 3.0 vs 2.5](./ChangeLogImg/relative-2.5-to-3.0.png)
+
+- - -
+
+_Copyright © 2021-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/Encoding.md b/lib/astc-encoder/Docs/Encoding.md
index 841527f458..85a3a73fed 100644
--- a/lib/astc-encoder/Docs/Encoding.md
+++ b/lib/astc-encoder/Docs/Encoding.md
@@ -201,3 +201,7 @@ A channel as LDR.
 For other use cases the alpha channel is simply a fourth data channel which is
 also storing an HDR value. For these cases use the `-cH` compressor option
 which will treat all channels as HDR data.
+
+- - -
+
+_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/FileFormat.md b/lib/astc-encoder/Docs/FileFormat.md
index 9e90030452..4ee84fe87c 100644
--- a/lib/astc-encoder/Docs/FileFormat.md
+++ b/lib/astc-encoder/Docs/FileFormat.md
@@ -65,3 +65,7 @@ Binary payload
 The binary payload is a byte stream that immediately follows the header. It
 contains 16 bytes per compressed block. The number of compressed blocks is
 determined from the header information.
+
+- - -
+
+_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/FormatOverview.md b/lib/astc-encoder/Docs/FormatOverview.md
index a0b3869132..00dd861566 100644
--- a/lib/astc-encoder/Docs/FormatOverview.md
+++ b/lib/astc-encoder/Docs/FormatOverview.md
@@ -482,3 +482,7 @@ which allow applications to reduce the intermediate precision to either UNORM8
 [astc_3d]: https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_texture_compression_astc_sliced_3d.txt
 [astc_full]: https://www.khronos.org/registry/OpenGL/extensions/OES/OES_texture_compression_astc.txt
 [astc_decode]: https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt
+
+- - -
+
+_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/Profiling.md b/lib/astc-encoder/Docs/Profiling.md
index 036069c7cd..de415d1dba 100644
--- a/lib/astc-encoder/Docs/Profiling.md
+++ b/lib/astc-encoder/Docs/Profiling.md
@@ -46,3 +46,7 @@ Standard syntax x86-64 disassembly can be generated using:
 ```shell
 objdump -C -M intel --no-show-raw -d -S <binary> > dis.txt
 ```
+
+- - -
+
+_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/Terminology.md b/lib/astc-encoder/Docs/Terminology.md
index ce7d9227d8..bf1dd0c717 100644
--- a/lib/astc-encoder/Docs/Terminology.md
+++ b/lib/astc-encoder/Docs/Terminology.md
@@ -73,3 +73,7 @@ interpolation weights. This gets very confusing in functions using all three!
 We are slowly refactoring the code to only use "weight" to mean the endpoint
 interpolation weights. The error weighting factors used for other purposes are
 being updated to use the using the term "significance".
+
+- - -
+
+_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Docs/Testing.md b/lib/astc-encoder/Docs/Testing.md
index a1907d2ae6..2cef276b2a 100644
--- a/lib/astc-encoder/Docs/Testing.md
+++ b/lib/astc-encoder/Docs/Testing.md
@@ -114,3 +114,7 @@ not introduced any obvious memory errors. Build a release build with symbols
 information with `-DCMAKE_BUILD_TYPE=RelWithDebInfo` and then run:
 
     valgrind --tool=memcheck --track-origins=yes <command>
+
+- - -
+
+_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
diff --git a/lib/astc-encoder/Source/astcenc_block_sizes.cpp b/lib/astc-encoder/Source/astcenc_block_sizes.cpp
index 4a9dc09058..70da1e714b 100644
--- a/lib/astc-encoder/Source/astcenc_block_sizes.cpp
+++ b/lib/astc-encoder/Source/astcenc_block_sizes.cpp
@@ -243,25 +243,20 @@ static bool decode_block_mode_3d(
  * @param      x_weights   The number of weights in the X dimension.
  * @param      y_weights   The number of weights in the Y dimension.
  * @param[out] di          The decimation info structure to populate.
+ * @param[out] wb          The decimation table init scratch working buffers.
  */
 static void init_decimation_info_2d(
 	unsigned int x_texels,
 	unsigned int y_texels,
 	unsigned int x_weights,
 	unsigned int y_weights,
-	decimation_info& di
+	decimation_info& di,
+	dt_init_working_buffers& wb
 ) {
 	unsigned int texels_per_block = x_texels * y_texels;
 	unsigned int weights_per_block = x_weights * y_weights;
 
-	uint8_t weight_count_of_texel[BLOCK_MAX_TEXELS];
-	uint8_t grid_weights_of_texel[BLOCK_MAX_TEXELS][4];
-	uint8_t weights_of_texel[BLOCK_MAX_TEXELS][4];
-
-	uint8_t texel_count_of_weight[BLOCK_MAX_WEIGHTS];
 	uint8_t max_texel_count_of_weight = 0;
-	uint8_t texels_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
-	uint8_t texel_weights_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
 
 	promise(weights_per_block > 0);
 	promise(texels_per_block > 0);
@@ -270,12 +265,12 @@ static void init_decimation_info_2d(
 
 	for (unsigned int i = 0; i < weights_per_block; i++)
 	{
-		texel_count_of_weight[i] = 0;
+		wb.texel_count_of_weight[i] = 0;
 	}
 
 	for (unsigned int i = 0; i < texels_per_block; i++)
 	{
-		weight_count_of_texel[i] = 0;
+		wb.weight_count_of_texel[i] = 0;
 	}
 
 	for (unsigned int y = 0; y < y_texels; y++)
@@ -311,13 +306,13 @@ static void init_decimation_info_2d(
 			{
 				if (weight[i] != 0)
 				{
-					grid_weights_of_texel[texel][weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
-					weights_of_texel[texel][weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
-					weight_count_of_texel[texel]++;
-					texels_of_weight[qweight[i]][texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
-					texel_weights_of_weight[qweight[i]][texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
-					texel_count_of_weight[qweight[i]]++;
-					max_texel_count_of_weight = astc::max(max_texel_count_of_weight, texel_count_of_weight[qweight[i]]);
+					wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
+					wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
+					wb.weight_count_of_texel[texel]++;
+					wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
+					wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
+					wb.texel_count_of_weight[qweight[i]]++;
+					max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
 				}
 			}
 		}
@@ -325,17 +320,17 @@ static void init_decimation_info_2d(
 
 	for (unsigned int i = 0; i < texels_per_block; i++)
 	{
-		di.texel_weight_count[i] = weight_count_of_texel[i];
+		di.texel_weight_count[i] = wb.weight_count_of_texel[i];
 
-		for (unsigned int j = 0; j < weight_count_of_texel[i]; j++)
+		for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
 		{
-			di.texel_weights_int_4t[j][i] = weights_of_texel[i][j];
-			di.texel_weights_float_4t[j][i] = ((float)weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
-			di.texel_weights_4t[j][i] = grid_weights_of_texel[i][j];
+			di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
+			di.texel_weights_float_4t[j][i] = ((float)wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
+			di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
 		}
 
 		// Init all 4 entries so we can rely on zeros for vectorization
-		for (unsigned int j = weight_count_of_texel[i]; j < 4; j++)
+		for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++)
 		{
 			di.texel_weights_int_4t[j][i] = 0;
 			di.texel_weights_float_4t[j][i] = 0.0f;
@@ -345,16 +340,16 @@ static void init_decimation_info_2d(
 
 	for (unsigned int i = 0; i < weights_per_block; i++)
 	{
-		unsigned int texel_count_wt = texel_count_of_weight[i];
+		unsigned int texel_count_wt = wb.texel_count_of_weight[i];
 		di.weight_texel_count[i] = (uint8_t)texel_count_wt;
 
 		for (unsigned int j = 0; j < texel_count_wt; j++)
 		{
-			uint8_t texel = texels_of_weight[i][j];
+			uint8_t texel = wb.texels_of_weight[i][j];
 
 			// Create transposed versions of these for better vectorization
 			di.weight_texel[j][i] = texel;
-			di.weights_flt[j][i] = (float)texel_weights_of_weight[i][j];
+			di.weights_flt[j][i] = (float)wb.texel_weights_of_weight[i][j];
 
 			// perform a layer of array unrolling. An aspect of this unrolling is that
 			// one of the texel-weight indexes is an identity-mapped index; we will use this
@@ -409,7 +404,7 @@ static void init_decimation_info_2d(
 
 	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
 	// Match last texel in active lane in SIMD group, for better gathers
-	unsigned int last_texel_count_wt = texel_count_of_weight[weights_per_block - 1];
+	unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 	uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
 
 	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
@@ -441,6 +436,7 @@ static void init_decimation_info_2d(
  * @param      y_weights   The number of weights in the Y dimension.
  * @param      z_weights   The number of weights in the Z dimension.
  * @param[out] di          The decimation info structure to populate.
+ * @param[out] wb          The decimation table init scratch working buffers.
  */
 static void init_decimation_info_3d(
 	unsigned int x_texels,
@@ -449,31 +445,25 @@ static void init_decimation_info_3d(
 	unsigned int x_weights,
 	unsigned int y_weights,
 	unsigned int z_weights,
-	decimation_info& di
+	decimation_info& di,
+	dt_init_working_buffers& wb
 ) {
 	unsigned int texels_per_block = x_texels * y_texels * z_texels;
 	unsigned int weights_per_block = x_weights * y_weights * z_weights;
 
-	uint8_t weight_count_of_texel[BLOCK_MAX_TEXELS];
-	uint8_t grid_weights_of_texel[BLOCK_MAX_TEXELS][4];
-	uint8_t weights_of_texel[BLOCK_MAX_TEXELS][4];
-
-	uint8_t texel_count_of_weight[BLOCK_MAX_WEIGHTS];
 	uint8_t max_texel_count_of_weight = 0;
-	uint8_t texels_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
-	uint8_t texel_weights_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
 
 	promise(weights_per_block > 0);
 	promise(texels_per_block > 0);
 
 	for (unsigned int i = 0; i < weights_per_block; i++)
 	{
-		texel_count_of_weight[i] = 0;
+		wb.texel_count_of_weight[i] = 0;
 	}
 
 	for (unsigned int i = 0; i < texels_per_block; i++)
 	{
-		weight_count_of_texel[i] = 0;
+		wb.weight_count_of_texel[i] = 0;
 	}
 
 	for (unsigned int z = 0; z < z_texels; z++)
@@ -580,13 +570,13 @@ static void init_decimation_info_3d(
 				{
 					if (weight[i] != 0)
 					{
-						grid_weights_of_texel[texel][weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
-						weights_of_texel[texel][weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
-						weight_count_of_texel[texel]++;
-						texels_of_weight[qweight[i]][texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
-						texel_weights_of_weight[qweight[i]][texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
-						texel_count_of_weight[qweight[i]]++;
-						max_texel_count_of_weight = astc::max(max_texel_count_of_weight, texel_count_of_weight[qweight[i]]);
+						wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
+						wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
+						wb.weight_count_of_texel[texel]++;
+						wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
+						wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
+						wb.texel_count_of_weight[qweight[i]]++;
+						max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
 					}
 				}
 			}
@@ -595,7 +585,7 @@ static void init_decimation_info_3d(
 
 	for (unsigned int i = 0; i < texels_per_block; i++)
 	{
-		di.texel_weight_count[i] = weight_count_of_texel[i];
+		di.texel_weight_count[i] = wb.weight_count_of_texel[i];
 
 		// Init all 4 entries so we can rely on zeros for vectorization
 		for (unsigned int j = 0; j < 4; j++)
@@ -605,26 +595,26 @@ static void init_decimation_info_3d(
 			di.texel_weights_4t[j][i] = 0;
 		}
 
-		for (unsigned int j = 0; j < weight_count_of_texel[i]; j++)
+		for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
 		{
-			di.texel_weights_int_4t[j][i] = weights_of_texel[i][j];
-			di.texel_weights_float_4t[j][i] = ((float)weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
-			di.texel_weights_4t[j][i] = grid_weights_of_texel[i][j];
+			di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
+			di.texel_weights_float_4t[j][i] = ((float)wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
+			di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
 		}
 	}
 
 	for (unsigned int i = 0; i < weights_per_block; i++)
 	{
-		unsigned int texel_count_wt = texel_count_of_weight[i];
+		unsigned int texel_count_wt = wb.texel_count_of_weight[i];
 		di.weight_texel_count[i] = (uint8_t)texel_count_wt;
 
 		for (unsigned int j = 0; j < texel_count_wt; j++)
 		{
-			unsigned int texel = texels_of_weight[i][j];
+			unsigned int texel = wb.texels_of_weight[i][j];
 
 			// Create transposed versions of these for better vectorization
 			di.weight_texel[j][i] = static_cast<uint8_t>(texel);
-			di.weights_flt[j][i] = static_cast<float>(texel_weights_of_weight[i][j]);
+			di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
 
 			// perform a layer of array unrolling. An aspect of this unrolling is that
 			// one of the texel-weight indexes is an identity-mapped index; we will use this
@@ -679,7 +669,7 @@ static void init_decimation_info_3d(
 
 	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
 	// Match last texel in active lane in SIMD group, for better gathers
-	int last_texel_count_wt = texel_count_of_weight[weights_per_block - 1];
+	int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 	uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
 
 	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
@@ -755,6 +745,8 @@ static void assign_kmeans_texels(
  * @param y_texels    The number of texels in the Y dimension.
  * @param x_weights   The number of weights in the X dimension.
  * @param y_weights   The number of weights in the Y dimension.
+ * @param bsd         The block size descriptor we are populating.
+ * @param wb          The decimation table init scratch working buffers.
  *
  * @return The new entry's index in the compacted decimation table array.
  */
@@ -763,7 +755,8 @@ static int construct_dt_entry_2d(
 	unsigned int y_texels,
 	unsigned int x_weights,
 	unsigned int y_weights,
-	block_size_descriptor& bsd
+	block_size_descriptor& bsd,
+	dt_init_working_buffers& wb
 ) {
 	unsigned int dm_index = bsd.decimation_mode_count;
 	unsigned int weight_count = x_weights * y_weights;
@@ -772,7 +765,7 @@ static int construct_dt_entry_2d(
 	bool try_2planes = (2 * weight_count) <= BLOCK_MAX_WEIGHTS;
 
 	decimation_info *di = aligned_malloc<decimation_info>(sizeof(decimation_info), ASTCENC_VECALIGN);
-	init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, *di);
+	init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, *di, wb);
 
 	int maxprec_1plane = -1;
 	int maxprec_2planes = -1;
@@ -829,6 +822,8 @@ static void construct_block_size_descriptor_2d(
 	static const unsigned int MAX_DMI = 12 * 16 + 12;
 	int decimation_mode_index[MAX_DMI];
 
+	dt_init_working_buffers* wb = new dt_init_working_buffers;
+
 	bsd.xdim = static_cast<uint8_t>(x_texels);
 	bsd.ydim = static_cast<uint8_t>(y_texels);
 	bsd.zdim = 1;
@@ -909,7 +904,7 @@ static void construct_block_size_descriptor_2d(
 			int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights];
 			if (decimation_mode < 0)
 			{
-				decimation_mode = construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd);
+				decimation_mode = construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb);
 				decimation_mode_index[y_weights * 16 + x_weights] = decimation_mode;
 
 	#if !defined(ASTCENC_DECOMPRESS_ONLY)
@@ -970,6 +965,8 @@ static void construct_block_size_descriptor_2d(
 
 	// Determine the texels to use for kmeans clustering.
 	assign_kmeans_texels(bsd);
+
+	delete wb;
 }
 
 /**
@@ -995,6 +992,8 @@ static void construct_block_size_descriptor_3d(
 	int decimation_mode_index[MAX_DMI];
 	unsigned int decimation_mode_count = 0;
 
+	dt_init_working_buffers* wb = new dt_init_working_buffers;
+
 	bsd.xdim = static_cast<uint8_t>(x_texels);
 	bsd.ydim = static_cast<uint8_t>(y_texels);
 	bsd.zdim = static_cast<uint8_t>(z_texels);
@@ -1020,7 +1019,7 @@ static void construct_block_size_descriptor_3d(
 
 				decimation_info *di = aligned_malloc<decimation_info>(sizeof(decimation_info), ASTCENC_VECALIGN);
 				decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count;
-				init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, *di);
+				init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, *di, *wb);
 
 				int maxprec_1plane = -1;
 				int maxprec_2planes = -1;
@@ -1114,6 +1113,8 @@ static void construct_block_size_descriptor_3d(
 
 	// Determine the texels to use for kmeans clustering.
 	assign_kmeans_texels(bsd);
+
+	delete wb;
 }
 
 /* See header for documentation. */
diff --git a/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp b/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp
index 01b2a8b697..315d7a3d14 100644
--- a/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp
+++ b/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp
@@ -50,6 +50,136 @@ static void merge_endpoints(
 	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
 }
 
+/**
+ * @brief Attempt to improve weights given a chosen configuration.
+ *
+ * Given a fixed weight value quantization, iterate over all weights (per plane) and attempt to
+ * improve image quality by moving each weight up by one or down by one quantization step.
+ *
+ * This is a specialized function which only supports undecimated weight grids (one weight per
+ * texel), primarily improving performance of 4x4 and 5x5 blocks where decimation is less common.
+ *
+ * @param         decode_mode                       The decode mode (LDR, HDR).
+ * @param         bsd                               The block size information.
+ * @param         blk                               The image block color data to compress.
+ * @param         scb                               The symbolic compressed block; only read here.
+ * @param[in,out] dec_weights_quant_pvalue_plane1   The weights for plane 1, updated in place.
+ * @param[in,out] dec_weights_quant_pvalue_plane2   The weights for plane 2, or @c nullptr if 1 plane.
+ *
+ * @return @c true if at least one weight was changed, @c false otherwise.
+ */
+static bool realign_weights_undecimated(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	symbolic_compressed_block& scb,
+	uint8_t* dec_weights_quant_pvalue_plane1,
+	uint8_t* dec_weights_quant_pvalue_plane2
+) {
+	// Get the partition descriptor for the partitioning selected in the symbolic block
+	unsigned int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the weight quantization table for the quant mode of the selected block mode
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	unsigned int weight_quant_level = bm.quant_mode;
+	const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_level]);
+
+	unsigned int max_plane = bm.is_dual_plane;  // Inclusive upper bound of the plane loop below
+	int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
+	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);  // Lanes equal to plane2_component
+
+	// Decode the color endpoints for every partition in use
+	bool rgb_hdr;    // Passed to unpack_color_endpoints; not read afterwards in this function
+	bool alpha_hdr;  // Passed to unpack_color_endpoints; not read afterwards in this function
+	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
+	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
+	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
+	vfloat4 offset[BLOCK_MAX_PARTITIONS];
+
+	promise(partition_count > 0);
+
+	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
+	{
+		unpack_color_endpoints(decode_mode,
+		                       scb.color_formats[pa_idx],
+		                       scb.get_color_quant_mode(),
+		                       scb.color_values[pa_idx],
+		                       rgb_hdr, alpha_hdr,
+		                       endpnt0[pa_idx],
+		                       endpnt1[pa_idx]);
+	}
+
+	uint8_t* dec_weights_quant_pvalue = dec_weights_quant_pvalue_plane1;
+	bool adjustments = false;
+
+	// For each plane (plane 1 weights first, then plane 2 weights if dual plane) ...
+	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
+	{
+		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
+		{
+			// Compute the endpoint delta for all components in current plane
+			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
+			epd = select(epd, vint4::zero(), plane_mask);
+
+			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
+			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);  // Color change per unit of weight
+		}
+
+		// For each texel's weight compute the previous, current, and next step errors
+		promise(bsd.texel_count > 0);
+		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
+		{
+			unsigned int uqw = qat->unquantized_value[dec_weights_quant_pvalue[texel]];
+
+			uint32_t prev_and_next = qat->prev_next_values[uqw];  // Four packed bytes, unpacked below
+			int prev_wt_uq = prev_and_next & 0xFF;         // Byte 0: previous weight, unquantized
+			int next_wt_uq = (prev_and_next >> 8) & 0xFF;  // Byte 1: next weight, unquantized
+
+			// Interpolate the colors to create the diffs, using the endpoints of the texel's partition
+			unsigned int partition = pi.partition_of_texel[texel];
+
+			int plane_weight = uqw;
+			int plane_up_weight = next_wt_uq - uqw;    // Weight delta if stepping to the next value
+			int plane_down_weight = prev_wt_uq - uqw;  // Weight delta if stepping to the previous value
+
+			vfloat4 color_offset = offset[partition];
+			vfloat4 color_base   = endpnt0f[partition];
+
+			vfloat4 color = color_base + color_offset * plane_weight;
+
+			vfloat4 orig_color   = blk.texel(texel);
+			vfloat4 error_weight = blk.channel_weight;
+
+			vfloat4 color_diff      = color - orig_color;
+			vfloat4 color_up_diff   = color_diff + color_offset * plane_up_weight;
+			vfloat4 color_down_diff = color_diff + color_offset * plane_down_weight;
+
+			float current_error = dot_s(color_diff      * color_diff,      error_weight);
+			float up_error      = dot_s(color_up_diff   * color_up_diff,   error_weight);
+			float down_error    = dot_s(color_down_diff * color_down_diff, error_weight);
+
+			// Use the next or previous weight if its error is strictly lower; ties keep the current one
+			if ((up_error < current_error) && (up_error < down_error))
+			{
+				dec_weights_quant_pvalue[texel] = (uint8_t)((prev_and_next >> 24) & 0xFF);  // Byte 3
+				adjustments = true;
+			}
+			else if (down_error < current_error)
+			{
+				dec_weights_quant_pvalue[texel] = (uint8_t)((prev_and_next >> 16) & 0xFF);  // Byte 2
+				adjustments = true;
+			}
+		}
+
+		// Prepare iteration for plane 2; the pointer is not dereferenced again after the last plane
+		dec_weights_quant_pvalue = dec_weights_quant_pvalue_plane2;
+		plane_mask = ~plane_mask;
+	}
+
+	return adjustments;
+}
+
 /**
  * @brief Attempt to improve weights given a chosen configuration.
  *
@@ -64,7 +194,7 @@ static void merge_endpoints(
  * @param[out] dec_weights_quant_pvalue_plane1   The weights for plane 1.
  * @param[out] dec_weights_quant_pvalue_plane2   The weights for plane 2, or @c nullptr if 1 plane.
  */
-static bool realign_weights(
+static bool realign_weights_generic(
 	astcenc_profile decode_mode,
 	const block_size_descriptor& bsd,
 	const image_block& blk,
@@ -84,6 +214,7 @@ static bool realign_weights(
 	// Get the decimation table
 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
 	unsigned int weight_count = di.weight_count;
+	bool is_decimated = weight_count != bsd.texel_count;
 
 	unsigned int max_plane = bm.is_dual_plane;
 	int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
@@ -112,6 +243,7 @@ static bool realign_weights(
 	}
 
 	uint8_t uq_pl_weights[BLOCK_MAX_WEIGHTS];
+	float uq_pl_weightsf[BLOCK_MAX_WEIGHTS];
 	uint8_t* dec_weights_quant_pvalue = dec_weights_quant_pvalue_plane1;
 	bool adjustments = false;
 
@@ -132,19 +264,21 @@ static bool realign_weights(
 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
 		{
 			uq_pl_weights[we_idx] = qat->unquantized_value[dec_weights_quant_pvalue[we_idx]];
+			uq_pl_weightsf[we_idx] = static_cast<float>(uq_pl_weights[we_idx]);
 		}
 
 		// For each weight compute previous, current, and next errors
 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
 		{
 			unsigned int uqw = uq_pl_weights[we_idx];
+			unsigned int uqwf = uq_pl_weightsf[we_idx];
 
 			uint32_t prev_and_next = qat->prev_next_values[uqw];
 			unsigned int prev_wt_uq = prev_and_next & 0xFF;
 			unsigned int next_wt_uq = (prev_and_next >> 8) & 0xFF;
 
-			int uqw_next_dif = next_wt_uq - uqw;
-			int uqw_prev_dif = prev_wt_uq - uqw;
+			float uqw_next_dif = static_cast<float>(next_wt_uq) - uqwf;
+			float uqw_prev_dif = static_cast<float>(prev_wt_uq) - uqwf;
 
 			float current_error = 0.0f;
 			float up_error = 0.0f;
@@ -156,41 +290,42 @@ static bool realign_weights(
 			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
 			{
 				unsigned int texel = di.weight_texel[te_idx][we_idx];
-				const uint8_t *texel_weights = di.texel_weights_texel[we_idx][te_idx];
-				const float *texel_weights_float = di.texel_weights_float_texel[we_idx][te_idx];
-				float twf0 = texel_weights_float[0];
-
-				float weight_base = static_cast<float>(uqw) * twf0;
+				float weight_base = uqwf;
+				float twf0 = 1.0f;
 
 				// Don't interpolate filtered weights for a 1:1 weight grid
-				if (weight_count != bsd.texel_count)
+				if (is_decimated)
 				{
+					const uint8_t *texel_weights = di.texel_weights_texel[we_idx][te_idx];
+					const float *texel_weights_float = di.texel_weights_float_texel[we_idx][te_idx];
+					twf0 = texel_weights_float[0];
+
 					weight_base =
-						(( weight_base
-						+ static_cast<float>(uq_pl_weights[texel_weights[1]])  * texel_weights_float[1])
-						+ (static_cast<float>(uq_pl_weights[texel_weights[2]]) * texel_weights_float[2]
-						+ static_cast<float>(uq_pl_weights[texel_weights[3]]) * texel_weights_float[3]));
+						  (uqwf                             * twf0
+						 + uq_pl_weightsf[texel_weights[1]] * texel_weights_float[1])
+						+ (uq_pl_weightsf[texel_weights[2]] * texel_weights_float[2]
+						 + uq_pl_weightsf[texel_weights[3]] * texel_weights_float[3]);
 				}
 
 				unsigned int partition = pi.partition_of_texel[texel];
 
 				weight_base = weight_base + 0.5f;
 				float plane_weight = astc::flt_rd(weight_base);
-				float plane_up_weight = astc::flt_rd(weight_base + static_cast<float>(uqw_next_dif) * twf0) - plane_weight;
-				float plane_down_weight = astc::flt_rd(weight_base + static_cast<float>(uqw_prev_dif) * twf0) - plane_weight;
+				float plane_up_weight = astc::flt_rd(weight_base + uqw_next_dif * twf0) - plane_weight;
+				float plane_down_weight = astc::flt_rd(weight_base + uqw_prev_dif * twf0) - plane_weight;
 
 				vfloat4 color_offset = offset[partition];
 				vfloat4 color_base   = endpnt0f[partition];
 
 				vfloat4 color = color_base + color_offset * plane_weight;
 
-				vfloat4 origcolor    = blk.texel(texel);
+				vfloat4 orig_color    = blk.texel(texel);
 				vfloat4 error_weight = blk.channel_weight;
 
-				vfloat4 colordiff       = color - origcolor;
-				vfloat4 color_up_diff   = colordiff + color_offset * plane_up_weight;
-				vfloat4 color_down_diff = colordiff + color_offset * plane_down_weight;
-				current_error += dot_s(colordiff       * colordiff,       error_weight);
+				vfloat4 color_diff      = color - orig_color;
+				vfloat4 color_up_diff   = color_diff + color_offset * plane_up_weight;
+				vfloat4 color_down_diff = color_diff + color_offset * plane_down_weight;
+				current_error += dot_s(color_diff      * color_diff,      error_weight);
 				up_error      += dot_s(color_up_diff   * color_up_diff,   error_weight);
 				down_error    += dot_s(color_down_diff * color_down_diff, error_weight);
 			}
@@ -199,12 +334,14 @@ static bool realign_weights(
 			if ((up_error < current_error) && (up_error < down_error))
 			{
 				uq_pl_weights[we_idx] = static_cast<uint8_t>(next_wt_uq);
+				uq_pl_weightsf[we_idx] = static_cast<float>(next_wt_uq);
 				dec_weights_quant_pvalue[we_idx] = (uint8_t)((prev_and_next >> 24) & 0xFF);
 				adjustments = true;
 			}
 			else if (down_error < current_error)
 			{
 				uq_pl_weights[we_idx] = static_cast<uint8_t>(prev_wt_uq);
+				uq_pl_weightsf[we_idx] = static_cast<float>(prev_wt_uq);
 				dec_weights_quant_pvalue[we_idx] = (uint8_t)((prev_and_next >> 16) & 0xFF);
 				adjustments = true;
 			}
@@ -300,23 +437,22 @@ static float compress_symbolic_block_for_partition_1plane(
 	float min_wt_cutoff = hmin_s(min_ep);
 
 	// For each mode, use the angular method to compute a shift
-	float weight_low_value[WEIGHTS_MAX_BLOCK_MODES];
-	float weight_high_value[WEIGHTS_MAX_BLOCK_MODES];
-
 	compute_angular_endpoints_1plane(
 	    config.tune_low_weight_count_limit,
 	    only_always, bsd,
 	    dec_weights_ideal_value, dec_weights_ideal_sig,
-	    weight_low_value, weight_high_value);
+	    tmpbuf);
+
+	float* weight_low_value = tmpbuf.weight_low_value1;
+	float* weight_high_value = tmpbuf.weight_high_value1;
+	int* qwt_bitcounts = tmpbuf.qwt_bitcounts;
+	float* qwt_errors = tmpbuf.qwt_errors;
 
 	// For each mode (which specifies a decimation and a quantization):
 	//     * Compute number of bits needed for the quantized weights
 	//     * Generate an optimized set of quantized weights
 	//     * Compute quantization errors for the mode
 
-	int qwt_bitcounts[WEIGHTS_MAX_BLOCK_MODES];
-	float qwt_errors[WEIGHTS_MAX_BLOCK_MODES];
-
 	for (unsigned int i = 0; i < bsd.block_mode_count; ++i)
 	{
 		qwt_errors[i] = 1e38f;
@@ -380,7 +516,7 @@ static float compress_symbolic_block_for_partition_1plane(
 	unsigned int candidate_count = compute_ideal_endpoint_formats(
 	    bsd, pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
 	    config.tune_candidate_limit, partition_format_specifiers, block_mode_index,
-	    color_quant_level, color_quant_level_mod);
+	    color_quant_level, color_quant_level_mod, tmpbuf);
 
 	// Iterate over the N believed-to-be-best modes to find out which one is actually best
 	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
@@ -530,10 +666,19 @@ static float compress_symbolic_block_for_partition_1plane(
 				}
 			}
 
-			// Perform a final pass over the weights to try to improve them.
-			bool adjustments = realign_weights(
-			    config.profile, bsd, blk, workscb,
-			    workscb.weights, nullptr);
+			bool adjustments;
+			if (di.weight_count != bsd.texel_count)
+			{
+				adjustments = realign_weights_generic(
+					config.profile, bsd, blk, workscb,
+					workscb.weights, nullptr);
+			}
+			else
+			{
+				adjustments = realign_weights_undecimated(
+					config.profile, bsd, blk, workscb,
+					workscb.weights, nullptr);
+			}
 
 			// Post-realign test
 			float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk);
@@ -667,24 +812,24 @@ static float compress_symbolic_block_for_partition_2planes(
 	// Set the minwt2 to the plane2 component min in ep2
 	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
 
-	float weight_low_value1[WEIGHTS_MAX_BLOCK_MODES];
-	float weight_high_value1[WEIGHTS_MAX_BLOCK_MODES];
-	float weight_low_value2[WEIGHTS_MAX_BLOCK_MODES];
-	float weight_high_value2[WEIGHTS_MAX_BLOCK_MODES];
-
 	compute_angular_endpoints_2planes(
 	    config.tune_low_weight_count_limit,
 	    bsd, dec_weights_ideal_value, dec_weights_ideal_sig,
-	    weight_low_value1, weight_high_value1,
-	    weight_low_value2, weight_high_value2);
+	    tmpbuf);
 
 	// For each mode (which specifies a decimation and a quantization):
 	//     * Compute number of bits needed for the quantized weights
 	//     * Generate an optimized set of quantized weights
 	//     * Compute quantization errors for the mode
 
-	int qwt_bitcounts[WEIGHTS_MAX_BLOCK_MODES];
-	float qwt_errors[WEIGHTS_MAX_BLOCK_MODES];
+	float* weight_low_value1 = tmpbuf.weight_low_value1;
+	float* weight_high_value1 = tmpbuf.weight_high_value1;
+	float* weight_low_value2 = tmpbuf.weight_low_value2;
+	float* weight_high_value2 = tmpbuf.weight_high_value2;
+
+	int* qwt_bitcounts = tmpbuf.qwt_bitcounts;
+	float* qwt_errors = tmpbuf.qwt_errors;
+
 	for (unsigned int i = 0; i < bsd.block_mode_count; ++i)
 	{
 		const block_mode& bm = bsd.block_modes[i];
@@ -762,7 +907,7 @@ static float compress_symbolic_block_for_partition_2planes(
 	unsigned int candidate_count = compute_ideal_endpoint_formats(
 	    bsd, pi, blk, epm, qwt_bitcounts, qwt_errors,
 	    config.tune_candidate_limit, partition_format_specifiers, block_mode_index,
-	    color_quant_level, color_quant_level_mod);
+	    color_quant_level, color_quant_level_mod, tmpbuf);
 
 	// Iterate over the N believed-to-be-best modes to find out which one is actually best
 	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
@@ -872,10 +1017,20 @@ static float compress_symbolic_block_for_partition_2planes(
 				}
 			}
 
-			// Perform a final pass over the weights to try to improve them
-			bool adjustments = realign_weights(
-			    config.profile, bsd, blk, workscb,
-			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET);
+			// Perform a final pass over the weights to try to improve them.
+			bool adjustments;
+			if (di.weight_count != bsd.texel_count)
+			{
+				adjustments = realign_weights_generic(
+					config.profile, bsd, blk, workscb,
+					workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET);
+			}
+			else
+			{
+				adjustments = realign_weights_undecimated(
+					config.profile, bsd, blk, workscb,
+					workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET);
+			}
 
 			// Post-realign test
 			float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk);
diff --git a/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp b/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp
index 478c1cf1c8..18677f8a5d 100644
--- a/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp
+++ b/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp
@@ -171,6 +171,20 @@ void unpack_weights(
 	}
 }
 
+/**
+ * @brief Return an FP32 NaN value for use in error colors.
+ *
+ * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
+ *
+ * @return The float color value.
+ */
+static float error_color_nan()
+{
+	if32 v;
+	v.u = 0xFFFFE000U;
+	return v.f;
+}
+
 /* See header for documentation. */
 void decompress_symbolic_block(
 	astcenc_profile decode_mode,
@@ -195,10 +209,10 @@ void decompress_symbolic_block(
 	{
 		for (unsigned int i = 0; i < bsd.texel_count; i++)
 		{
-			blk.data_r[i] = std::numeric_limits<float>::quiet_NaN();
-			blk.data_g[i] = std::numeric_limits<float>::quiet_NaN();
-			blk.data_b[i] = std::numeric_limits<float>::quiet_NaN();
-			blk.data_a[i] = std::numeric_limits<float>::quiet_NaN();
+			blk.data_r[i] = error_color_nan();
+			blk.data_g[i] = error_color_nan();
+			blk.data_b[i] = error_color_nan();
+			blk.data_a[i] = error_color_nan();
 			blk.rgb_lns[i] = 0;
 			blk.alpha_lns[i] = 0;
 		}
@@ -234,7 +248,7 @@ void decompress_symbolic_block(
 			{
 			case ASTCENC_PRF_LDR_SRGB:
 			case ASTCENC_PRF_LDR:
-				color = vfloat4(std::numeric_limits<float>::quiet_NaN());
+				color = vfloat4(error_color_nan());
 				break;
 			case ASTCENC_PRF_HDR_RGB_LDR_A:
 			case ASTCENC_PRF_HDR:
diff --git a/lib/astc-encoder/Source/astcenc_entry.cpp b/lib/astc-encoder/Source/astcenc_entry.cpp
index b77857c0bd..563c485743 100644
--- a/lib/astc-encoder/Source/astcenc_entry.cpp
+++ b/lib/astc-encoder/Source/astcenc_entry.cpp
@@ -231,13 +231,22 @@ static astcenc_error validate_block_size(
 	unsigned int block_y,
 	unsigned int block_z
 ) {
-	if (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
-	    ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)))
+	// Test if this is a legal block size at all
+	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
+	                 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
+	if (!is_legal)
 	{
-		return ASTCENC_SUCCESS;
+		return ASTCENC_ERR_BAD_BLOCK_SIZE;
 	}
 
-	return ASTCENC_ERR_BAD_BLOCK_SIZE;
+	// Test if this build has sufficient capacity; a 2D block may pass block_z == 0
+	bool have_capacity = (block_x * block_y * (block_z ? block_z : 1u)) <= BLOCK_MAX_TEXELS;
+	if (!have_capacity)
+	{
+		return ASTCENC_ERR_NOT_IMPLEMENTED;
+	}
+
+	return ASTCENC_SUCCESS;
 }
 
 /**
@@ -1110,7 +1119,7 @@ astcenc_error astcenc_decompress_image(
 			                          scb, blk);
 
 			write_image_block(image_out, blk, *ctx->bsd,
-			                 x * block_x, y * block_y, z * block_z, *swizzle);
+			                  x * block_x, y * block_y, z * block_z, *swizzle);
 		}
 
 		ctx->manage_decompress.complete_task_assignment(count);
diff --git a/lib/astc-encoder/Source/astcenc_image.cpp b/lib/astc-encoder/Source/astcenc_image.cpp
index 47af5714a7..cda80722dd 100644
--- a/lib/astc-encoder/Source/astcenc_image.cpp
+++ b/lib/astc-encoder/Source/astcenc_image.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -318,9 +318,9 @@ void write_image_block(
 				{
 					vint4 colori = vint4::zero();
 
-					if (blk.data_r[idx] == std::numeric_limits<float>::quiet_NaN())
+					// Errors are NaN encoded - convert to magenta error color
+					if (blk.data_r[idx] != blk.data_r[idx])
 					{
-						// Can't display NaN - show magenta error color
 						colori = vint4(0xFF, 0x00, 0xFF, 0xFF);
 					}
 					else if (needs_swz)
@@ -374,11 +374,8 @@ void write_image_block(
 				{
 					vint4 color;
 
-					if (blk.data_r[idx] == std::numeric_limits<float>::quiet_NaN())
-					{
-						color = vint4(0xFFFF);
-					}
-					else if (needs_swz)
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
 					{
 						data[ASTCENC_SWZ_R] = blk.data_r[idx];
 						data[ASTCENC_SWZ_G] = blk.data_g[idx];
@@ -433,11 +430,8 @@ void write_image_block(
 				{
 					vfloat4 color = blk.texel(idx);
 
-					if (color.lane<0>() == std::numeric_limits<float>::quiet_NaN())
-					{
-						color = vfloat4(std::numeric_limits<float>::quiet_NaN());
-					}
-					else if (needs_swz)
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
 					{
 						data[ASTCENC_SWZ_R] = color.lane<0>();
 						data[ASTCENC_SWZ_G] = color.lane<1>();
diff --git a/lib/astc-encoder/Source/astcenc_internal.h b/lib/astc-encoder/Source/astcenc_internal.h
index 5981fd1d02..a1c7e74be9 100644
--- a/lib/astc-encoder/Source/astcenc_internal.h
+++ b/lib/astc-encoder/Source/astcenc_internal.h
@@ -66,6 +66,13 @@
 /* ============================================================================
   Constants
 ============================================================================ */
+#if !defined(ASTCENC_BLOCK_MAX_TEXELS)
+	#define ASTCENC_BLOCK_MAX_TEXELS 216 // A 3D 6x6x6 block
+#endif
+
+/** @brief The maximum number of texels a block can support in this build (default is a 6x6x6 block). */
+static constexpr unsigned int BLOCK_MAX_TEXELS { ASTCENC_BLOCK_MAX_TEXELS };
+
 /** @brief The maximum number of components a block can support. */
 static constexpr unsigned int BLOCK_MAX_COMPONENTS { 4 };
 
@@ -75,9 +82,6 @@ static constexpr unsigned int BLOCK_MAX_PARTITIONS { 4 };
 /** @brief The number of partitionings, per partition count, suported by the ASTC format. */
 static constexpr unsigned int BLOCK_MAX_PARTITIONINGS { 1024 };
 
-/** @brief The maximum number of texels a block can support (6x6x6 block). */
-static constexpr unsigned int BLOCK_MAX_TEXELS { 216 };
-
 /** @brief The maximum number of weights used during partition selection for texel clustering. */
 static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS { 64 };
 
@@ -1022,6 +1026,59 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
 	 * For two plane encodings, second plane weights start at @c WEIGHTS_PLANE2_OFFSET offsets.
 	 */
 	alignas(ASTCENC_VECALIGN) uint8_t dec_weights_quant_pvalue[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
+
+	/** @brief Error of the best encoding combination for each block mode. */
+	alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best color quant for each block mode. */
+	alignas(ASTCENC_VECALIGN) quant_method best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best color quant for each block mode if modes are the same and we have spare bits. */
+	quant_method best_quant_levels_mod[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best endpoint format for each partition. */
+	int best_ep_formats[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS];
+
+	/** @brief The total bit storage needed for quantized weights for each block mode. */
+	int qwt_bitcounts[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The cumulative error for quantized weights for each block mode. */
+	float qwt_errors[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 1 for each block mode. */
+	float weight_low_value1[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The high weight value in plane 1 for each block mode. */
+	float weight_high_value1[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 1 for each quant level and decimation mode. */
+	float weight_low_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
+
+	/** @brief The high weight value in plane 1 for each quant level and decimation mode. */
+	float weight_high_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
+
+	/** @brief The low weight value in plane 2 for each block mode. */
+	float weight_low_value2[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The high weight value in plane 2 for each block mode. */
+	float weight_high_value2[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 2 for each quant level and decimation mode. */
+	float weight_low_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
+
+	/** @brief The high weight value in plane 2 for each quant level and decimation mode. */
+	float weight_high_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
+};
+
+struct dt_init_working_buffers
+{
+	uint8_t weight_count_of_texel[BLOCK_MAX_TEXELS];
+	uint8_t grid_weights_of_texel[BLOCK_MAX_TEXELS][4];
+	uint8_t weights_of_texel[BLOCK_MAX_TEXELS][4];
+
+	uint8_t texel_count_of_weight[BLOCK_MAX_WEIGHTS];
+	uint8_t texels_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
+	uint8_t texel_weights_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
 };
 
 /**
@@ -1935,6 +1992,7 @@ void unpack_weights(
  * @param[out] block_mode                    The best packed block mode indexes.
  * @param[out] quant_level                   The best color quant level.
  * @param[out] quant_level_mod               The best color quant level if endpoints are the same.
+ * @param[out] tmpbuf                        Preallocated scratch buffers for the compressor.
  *
  * @return The actual number of candidate matches returned.
  */
@@ -1949,7 +2007,8 @@ unsigned int compute_ideal_endpoint_formats(
 	int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
 	int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
 	quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
-	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]);
+	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
+	compression_working_buffers& tmpbuf);
 
 /**
  * @brief For a given 1 plane weight set recompute the endpoint colors.
@@ -2018,8 +2077,7 @@ void prepare_angular_tables();
  * @param      bsd                       The block size descriptor for the current trial.
  * @param      dec_weight_quant_uvalue   The decimated and quantized weight values.
  * @param      dec_weight_quant_sig      The significance of each weight.
- * @param[out] low_value                 The lowest weight to consider for each block mode.
- * @param[out] high_value                The highest weight to consider for each block mode.
+ * @param[out] tmpbuf                    Preallocated scratch buffers for the compressor.
  */
 void compute_angular_endpoints_1plane(
 	unsigned int tune_low_weight_limit,
@@ -2027,30 +2085,23 @@ void compute_angular_endpoints_1plane(
 	const block_size_descriptor& bsd,
 	const float* dec_weight_quant_uvalue,
 	const float* dec_weight_quant_sig,
-	float low_value[WEIGHTS_MAX_BLOCK_MODES],
-	float high_value[WEIGHTS_MAX_BLOCK_MODES]);
+	compression_working_buffers& tmpbuf);
 
 /**
  * @brief Compute the angular endpoints for two planes for each block mode.
  *
- * @param      tune_low_weight_limit    Weight count cutoff below which we use simpler searches.
- * @param     bsd                       The block size descriptor for the current trial.
- * @param     dec_weight_quant_uvalue   The decimated and quantized weight values.
- * @param     dec_weight_quant_sig      The significance of each weight.
- * @param[out] low_value1               The lowest weight p1 to consider for each block mode.
- * @param[out] high_value1              The highest weight p1 to consider for each block mode.
- * @param[out] low_value2               The lowest weight p2 to consider for each block mode.
- * @param[out] high_value2              The highest weight p2 to consider for each block mode.
+ * @param      tune_low_weight_limit     Weight count cutoff below which we use simpler searches.
+ * @param      bsd                       The block size descriptor for the current trial.
+ * @param      dec_weight_quant_uvalue   The decimated and quantized weight values.
+ * @param      dec_weight_quant_sig      The significance of each weight.
+ * @param[out] tmpbuf                    Preallocated scratch buffers for the compressor.
  */
 void compute_angular_endpoints_2planes(
 	unsigned int tune_low_weight_limit,
 	const block_size_descriptor& bsd,
 	const float* dec_weight_quant_uvalue,
 	const float* dec_weight_quant_sig,
-	float low_value1[WEIGHTS_MAX_BLOCK_MODES],
-	float high_value1[WEIGHTS_MAX_BLOCK_MODES],
-	float low_value2[WEIGHTS_MAX_BLOCK_MODES],
-	float high_value2[WEIGHTS_MAX_BLOCK_MODES]);
+	compression_working_buffers& tmpbuf);
 
 /* ============================================================================
   Functionality for high level compression and decompression access.
diff --git a/lib/astc-encoder/Source/astcenc_partition_tables.cpp b/lib/astc-encoder/Source/astcenc_partition_tables.cpp
index 52d76cfaf2..314761c994 100644
--- a/lib/astc-encoder/Source/astcenc_partition_tables.cpp
+++ b/lib/astc-encoder/Source/astcenc_partition_tables.cpp
@@ -93,15 +93,15 @@ static bool compare_canonical_partitionings(
  * which have the same texel assignment groupings. It is only useful for the compressor to test one
  * of each, so we mark duplicates as invalid.
  *
- * @param         texel_count   The first canonical bit pattern to check.
- * @param[in,out] pt            The table of partitioning information entries.
+ * @param         bit_patterns   The scratch memory for the bit patterns.
+ * @param         texel_count    The number of texels in the block.
+ * @param[in,out] pt             The table of partitioning information entries.
  */
 static void remove_duplicate_partitionings(
+	uint64_t* bit_patterns,
 	int texel_count,
 	partition_info pt[BLOCK_MAX_PARTITIONINGS]
 ) {
-	uint64_t bit_patterns[BLOCK_MAX_PARTITIONINGS * 7];
-
 	for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
 	{
 		generate_canonical_partitioning(texel_count, pt[i].partition_of_texel, bit_patterns + i * 7);
@@ -373,7 +373,11 @@ void init_partition_tables(
 		generate_one_partition_info_entry(bsd, 4, i, par_tab4[i]);
 	}
 
-	remove_duplicate_partitionings(bsd.texel_count, par_tab2);
-	remove_duplicate_partitionings(bsd.texel_count, par_tab3);
-	remove_duplicate_partitionings(bsd.texel_count, par_tab4);
+	uint64_t* bit_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * 7];
+
+	remove_duplicate_partitionings(bit_patterns, bsd.texel_count, par_tab2);
+	remove_duplicate_partitionings(bit_patterns, bsd.texel_count, par_tab3);
+	remove_duplicate_partitionings(bit_patterns, bsd.texel_count, par_tab4);
+
+	delete[] bit_patterns;
 }
diff --git a/lib/astc-encoder/Source/astcenc_percentile_tables.cpp b/lib/astc-encoder/Source/astcenc_percentile_tables.cpp
index 9d4bfcbfdd..1744d996f9 100644
--- a/lib/astc-encoder/Source/astcenc_percentile_tables.cpp
+++ b/lib/astc-encoder/Source/astcenc_percentile_tables.cpp
@@ -45,6 +45,7 @@ struct packed_percentile_table
 	const uint16_t *items[2];
 };
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (4 * 4)
 static const uint16_t percentile_arr_4x4_0[61] {
 	0x0242,0x7243,0x6A51,0x6A52,0x5A41,0x4A53,0x8851,0x3842,
 	0x3852,0x3853,0x3043,0xFA33,0x1BDF,0x2022,0x1032,0x29CE,
@@ -77,7 +78,9 @@ static const packed_percentile_table block_pcd_4x4 {
 	{ 0, 53 },
 	{ percentile_arr_4x4_0, percentile_arr_4x4_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 4)
 static const uint16_t percentile_arr_5x4_0[91] {
 	0x02C1,0xFAD1,0xE8D3,0xDAC2,0xA8D2,0x70D1,0x50C2,0x80C3,
 	0xD2C3,0x4AA2,0x2AD2,0x2242,0x2251,0x42A3,0x1A43,0x4A52,
@@ -116,7 +119,9 @@ static const packed_percentile_table block_pcd_5x4 {
 	{ 0, 202 },
 	{ percentile_arr_5x4_0, percentile_arr_5x4_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 5)
 static const uint16_t percentile_arr_5x5_0[129] {
 	0x00F3,0xF8F2,0x70E3,0x62E1,0x60E1,0x4AC1,0x3261,0x38D3,
 	0x3271,0x5AF1,0x5873,0x2AD1,0x28E2,0x28F1,0x2262,0x9AC2,
@@ -163,7 +168,9 @@ static const packed_percentile_table block_pcd_5x5 {
 	{ 0, 116 },
 	{ percentile_arr_5x5_0, percentile_arr_5x5_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 5)
 static const uint16_t percentile_arr_6x5_0[165] {
 	0x0163,0xF8F3,0x9962,0x8972,0x7961,0x7173,0x6953,0x5943,
 	0x4B41,0x3AE1,0x38E3,0x6971,0x32C1,0x28D3,0x2A61,0xC8F2,
@@ -217,7 +224,9 @@ static const packed_percentile_table block_pcd_6x5 {
 	{ 0, 156 },
 	{ percentile_arr_6x5_0, percentile_arr_6x5_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 6)
 static const uint16_t percentile_arr_6x6_0[206] {
 	0x006F,0xF908,0xF104,0xE918,0xE963,0xD114,0xB0F3,0xA07E,
 	0x7972,0x705F,0x687F,0x6162,0x5953,0x586E,0x610C,0x524D,
@@ -278,7 +287,9 @@ static const packed_percentile_table block_pcd_6x6 {
 	{ 0, 256 },
 	{ percentile_arr_6x6_0, percentile_arr_6x6_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 5)
 static const uint16_t percentile_arr_8x5_0[226] {
 	0x0066,0xF865,0xE963,0xA856,0xA1F2,0x9875,0x91C3,0x91E2,
 	0x80F3,0x8076,0x61E3,0x6153,0x5172,0x59D2,0x51D3,0x5047,
@@ -342,7 +353,9 @@ static const packed_percentile_table block_pcd_8x5 {
 	{ 0, 178 },
 	{ percentile_arr_8x5_0, percentile_arr_8x5_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 6)
 static const uint16_t percentile_arr_8x6_0[273] {
 	0x0154,0xF944,0xE066,0xA128,0x9963,0x8118,0x806F,0x79F2,
 	0x79E2,0x7108,0xD934,0x6056,0x69C3,0x60F3,0x5972,0x59E3,
@@ -415,7 +428,9 @@ static const packed_percentile_table block_pcd_8x6 {
 	{ 0, 64 },
 	{ percentile_arr_8x6_0, percentile_arr_8x6_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 8)
 static const uint16_t percentile_arr_8x8_0[347] {
 	0x0334,0xFD44,0xDD14,0x9154,0x9B08,0x906A,0x8928,0x8108,
 	0xE866,0xC918,0x606F,0xC0FE,0x5963,0x58EE,0x6534,0x505A,
@@ -499,7 +514,9 @@ static const packed_percentile_table block_pcd_8x8 {
 	{ 0, 38 },
 	{ percentile_arr_8x8_0, percentile_arr_8x8_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 5)
 static const uint16_t percentile_arr_10x5_0[274] {
 	0x0165,0xF975,0xD866,0xC056,0xA946,0x90C6,0x90F5,0x8963,
 	0x80D6,0x80E6,0x60F3,0x61C3,0x59F2,0xA927,0x5075,0x4847,
@@ -571,7 +588,9 @@ static const packed_percentile_table block_pcd_10x5 {
 	{ 0, 79 },
 	{ percentile_arr_10x5_0, percentile_arr_10x5_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 6)
 static const uint16_t percentile_arr_10x6_0[325] {
 	0x01A4,0xF954,0xA066,0x9975,0x80F5,0x7056,0x6918,0x6963,
 	0x58C6,0x5946,0x5928,0x5174,0x586F,0xA0E6,0x5108,0x48D6,
@@ -651,7 +670,9 @@ static const packed_percentile_table block_pcd_10x6 {
 	{ 0, 78 },
 	{ percentile_arr_10x6_0, percentile_arr_10x6_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 8)
 static const uint16_t percentile_arr_10x8_0[400] {
 	0x0154,0xAB34,0xAD44,0x8308,0x7866,0x7B64,0x79A4,0x7975,
 	0x686A,0x6908,0xC514,0x6174,0x6128,0x6118,0x5B54,0x5163,
@@ -744,7 +765,9 @@ static const packed_percentile_table block_pcd_10x8 =
 	{ 0, 52 },
 	{ percentile_arr_10x8_0, percentile_arr_10x8_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 10)
 static const uint16_t percentile_arr_10x10_0[453] {
 	0x0334,0x9514,0x8954,0x806A,0x6F14,0x6724,0x6108,0x6364,
 	0x5175,0x5D44,0x5866,0x5118,0x5308,0xA179,0x5128,0xF534,
@@ -845,7 +868,9 @@ static const packed_percentile_table block_pcd_10x10 {
 	{ 0, 70 },
 	{ percentile_arr_10x10_0, percentile_arr_10x10_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 10)
 static const uint16_t percentile_arr_12x10_0[491] {
 	0x0334,0x9954,0x8514,0x7128,0x6364,0xC174,0x5D34,0x5866,
 	0x5975,0x5354,0xAF14,0x506A,0x5108,0x5724,0x5308,0x4544,
@@ -952,7 +977,9 @@ static const packed_percentile_table block_pcd_12x10 =
 	{ 0, 23 },
 	{ percentile_arr_12x10_0, percentile_arr_12x10_1 }
 };
+#endif
 
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 12)
 static const uint16_t percentile_arr_12x12_0[529] {
 	0x0334,0xF534,0x8514,0x8954,0x7F14,0xFB54,0x7B08,0x7128,
 	0x7974,0x6179,0x6B64,0x6908,0x606A,0x6724,0xB544,0xB066,
@@ -1064,6 +1091,7 @@ static const packed_percentile_table block_pcd_12x12 {
 	{ 0, 22 },
 	{ percentile_arr_12x12_0, percentile_arr_12x12_1 }
 };
+#endif
 
 /**
  * @brief Fetch the packed percentile table for the given 2D block size.
@@ -1080,20 +1108,48 @@ static const packed_percentile_table *get_packed_table(
 	int idx = (ydim << 8) | xdim;
 	switch (idx)
 	{
+#if ASTCENC_BLOCK_MAX_TEXELS >= (4 * 4)
 		case 0x0404: return &block_pcd_4x4;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 4)
 		case 0x0405: return &block_pcd_5x4;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 5)
 		case 0x0505: return &block_pcd_5x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 5)
 		case 0x0506: return &block_pcd_6x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 6)
 		case 0x0606: return &block_pcd_6x6;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 5)
 		case 0x0508: return &block_pcd_8x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 6)
 		case 0x0608: return &block_pcd_8x6;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 8)
 		case 0x0808: return &block_pcd_8x8;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 5)
 		case 0x050A: return &block_pcd_10x5;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 6)
 		case 0x060A: return &block_pcd_10x6;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 8)
 		case 0x080A: return &block_pcd_10x8;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 10)
 		case 0x0A0A: return &block_pcd_10x10;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 10)
 		case 0x0A0C: return &block_pcd_12x10;
+#endif
+#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 12)
 		case 0x0C0C: return &block_pcd_12x12;
+#endif
 	}
 
 	// Should never hit this with a valid 2D block size
diff --git a/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp b/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp
index 140edb1029..4082ca9a24 100644
--- a/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp
+++ b/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp
@@ -1052,7 +1052,8 @@ unsigned int compute_ideal_endpoint_formats(
 	int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
 	int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
 	quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
-	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]
+	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
+	compression_working_buffers& tmpbuf
 ) {
 	int partition_count = pi.partition_count;
 
@@ -1077,10 +1078,10 @@ unsigned int compute_ideal_endpoint_formats(
 		    format_of_choice[i]);
 	}
 
-	alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
-	alignas(ASTCENC_VECALIGN) quant_method best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
-	quant_method best_quant_levels_mod[WEIGHTS_MAX_BLOCK_MODES];
-	int best_ep_formats[WEIGHTS_MAX_BLOCK_MODES][4];
+	float* errors_of_best_combination = tmpbuf.errors_of_best_combination;
+	quant_method* best_quant_levels = tmpbuf.best_quant_levels;
+	quant_method* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
+	int (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
 
 	// Ensure that the "overstep" of the last iteration in the vectorized loop will contain data
 	// that will never be picked as best candidate
diff --git a/lib/astc-encoder/Source/astcenc_weight_align.cpp b/lib/astc-encoder/Source/astcenc_weight_align.cpp
index e29ff8861e..e2257fb522 100644
--- a/lib/astc-encoder/Source/astcenc_weight_align.cpp
+++ b/lib/astc-encoder/Source/astcenc_weight_align.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -111,7 +111,7 @@ static void compute_angular_offsets(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);
 
-	alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS] { 0 };
+	alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
 
 	// Precompute isample; arrays are always allocated 64 elements long
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
@@ -518,11 +518,13 @@ void compute_angular_endpoints_1plane(
 	const block_size_descriptor& bsd,
 	const float* dec_weight_quant_uvalue,
 	const float* dec_weight_quant_sig,
-	float low_value[WEIGHTS_MAX_BLOCK_MODES],
-	float high_value[WEIGHTS_MAX_BLOCK_MODES]
+	compression_working_buffers& tmpbuf
 ) {
-	float low_values[WEIGHTS_MAX_DECIMATION_MODES][12];
-	float high_values[WEIGHTS_MAX_DECIMATION_MODES][12];
+	float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
+	float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
+
+	float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
+	float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
 
 	unsigned int max_decimation_modes = only_always ? bsd.always_decimation_mode_count
 	                                                : bsd.decimation_mode_count;
@@ -580,15 +582,17 @@ void compute_angular_endpoints_2planes(
 	const block_size_descriptor& bsd,
 	const float* dec_weight_quant_uvalue,
 	const float* dec_weight_quant_sig,
-	float low_value1[WEIGHTS_MAX_BLOCK_MODES],
-	float high_value1[WEIGHTS_MAX_BLOCK_MODES],
-	float low_value2[WEIGHTS_MAX_BLOCK_MODES],
-	float high_value2[WEIGHTS_MAX_BLOCK_MODES]
+	compression_working_buffers& tmpbuf
 ) {
-	float low_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
-	float high_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
-	float low_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
-	float high_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
+	float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
+	float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
+	float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
+	float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
+
+	float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
+	float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
+	float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values2;
+	float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values2;
 
 	promise(bsd.decimation_mode_count > 0);
 	for (unsigned int i = 0; i < bsd.decimation_mode_count; i++)
diff --git a/lib/astc-encoder/Source/astcenccli_image.cpp b/lib/astc-encoder/Source/astcenccli_image.cpp
index 218c409658..8bdfb51799 100644
--- a/lib/astc-encoder/Source/astcenccli_image.cpp
+++ b/lib/astc-encoder/Source/astcenccli_image.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -169,7 +169,7 @@ int determine_image_components(const astcenc_image * img)
 		}
 	}
 
-	int image_components = 1 + (is_luma == 0 ? 0 : 2) + (has_alpha ? 0 : 1);
+	int image_components = 1 + (is_luma == 0 ? 2 : 0) + (has_alpha ? 1 : 0);
 	return image_components;
 }
 
diff --git a/lib/astc-encoder/Source/cmake_core.cmake b/lib/astc-encoder/Source/cmake_core.cmake
index 8431fd8c84..f676a1a4e6 100644
--- a/lib/astc-encoder/Source/cmake_core.cmake
+++ b/lib/astc-encoder/Source/cmake_core.cmake
@@ -1,6 +1,6 @@
 #  SPDX-License-Identifier: Apache-2.0
 #  ----------------------------------------------------------------------------
-#  Copyright 2020-2021 Arm Limited
+#  Copyright 2020-2022 Arm Limited
 #
 #  Licensed under the Apache License, Version 2.0 (the "License"); you may not
 #  use this file except in compliance with the License. You may obtain a copy
@@ -89,6 +89,12 @@ macro(astcenc_set_properties NAME)
                 ASTCENC_DECOMPRESS_ONLY)
     endif()
 
+    if(${BLOCK_MAX_TEXELS})
+        target_compile_definitions(${NAME}
+            PRIVATE
+                ASTCENC_BLOCK_MAX_TEXELS=${BLOCK_MAX_TEXELS})
+    endif()
+
     if(${DIAGNOSTICS})
         target_compile_definitions(${NAME}
             PUBLIC
@@ -128,7 +134,7 @@ macro(astcenc_set_properties NAME)
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-shift-sign-overflow>
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-format-nonliteral>
 
-           $<$<CXX_COMPILER_ID:Clang>:-Wdocumentation>)
+            $<$<CXX_COMPILER_ID:Clang>:-Wdocumentation>)
 
     target_link_options(${NAME}
         PRIVATE
diff --git a/lib/astc-encoder/Test/astc_test_competitive.py b/lib/astc-encoder/Test/astc_test_competitive.py
new file mode 100644
index 0000000000..4cfb3828bf
--- /dev/null
+++ b/lib/astc-encoder/Test/astc_test_competitive.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# -----------------------------------------------------------------------------
+# Copyright 2022 Arm Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy
+# of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+# -----------------------------------------------------------------------------
+"""
+This script is a simple test runner for sweeps on multiple compressors.
+"""
+
+import os
+import subprocess as sp
+import re
+import sys
+
+LOG_COMMANDS = False
+LOG_PATTERN = re.compile(r"\s*Coding rate:\s*(.*)\s*MT/s")
+
+ISPC_BIN = "./Binaries/ISPC/ispc_astc.exe"
+ISPC_QUALITY = ["rgba", "rgb"]
+
+ASTC_BIN = "./astcenc/astcenc-avx2"
+ASTC_QUALITY = ["-medium", "-fast", "-fastest"]
+
+TEST_BLOCK_SIZES = ["4x4", "6x6", "8x8"]
+
+TEST_IMAGE = "./Test/Images/Kodak/LDR-RGB/ldr-rgb-kodak%02u.png"
+TEST_RANGE = 24
+TEST_REPEATS = 5
+
+OUT_CIMAGE = "out.astc"
+OUT_DIMAGE = "out.png"
+
+
+def run(command):
+    """Run a command with captured text output; the exit status is not checked."""
+    if LOG_COMMANDS:  # Echo the command line first when logging is enabled
+        print(" ".join(command))
+    return sp.run(command, capture_output=True, universal_newlines=True)
+
+
+def run_astcenc(in_image, out_image, block_size, quality):  # Returns the "Coding rate" value parsed from stdout
+    args = [ASTC_BIN, "-tl", in_image, out_image, block_size, quality, "-j", "1"]
+    result = run(args)
+    return float(LOG_PATTERN.search(result.stdout).group(1))  # AttributeError if stdout has no LOG_PATTERN match
+
+
+def run_ispc(in_image, out_image, block_size, quality):  # Returns the "Coding rate" value parsed from stdout
+    args = [ISPC_BIN, in_image, out_image, block_size, quality]
+    result = run(args)
+    return float(LOG_PATTERN.search(result.stdout).group(1))  # AttributeError if stdout has no LOG_PATTERN match
+
+
+def decompress(in_image, out_image):
+    """Run ASTC_BIN in "-dl" mode from in_image to out_image, then delete in_image."""
+    run([ASTC_BIN, "-dl", in_image, out_image])  # Result is intentionally unused
+    os.remove(in_image)
+
+
+def compare(in_image, out_image):  # Returns the metric that the tool wrote to stderr, as a float
+    args = ["compare", "-metric", "PSNR", in_image, out_image, "diff.png"]  # presumably ImageMagick compare - confirm
+    result = run(args)
+    os.remove("diff.png")  # Discard the difference image; only the metric is used
+    os.remove(out_image)  # Note: this also deletes the image under test
+    return float(result.stderr)
+
+
+def main():
+    """
+    Sweep both compressors over the Kodak images, printing CSV result rows.
+
+    Returns:
+        int: The process return code.
+    """
+    # ISPC tests: compress with ISPC_BIN, decompress with ASTC_BIN, then measure
+    for block_size in TEST_BLOCK_SIZES:
+        for quality in ISPC_QUALITY:
+            print(f"ISPC {quality} {block_size}")  # Section title row on stdout
+            print(f"ISPC {quality} {block_size}", file=sys.stderr)  # Same title echoed to stderr
+            for index in range(1, TEST_RANGE + 1):
+                result_rate = 0.0
+                for repeat in range(0, TEST_REPEATS):
+                    image = TEST_IMAGE % index
+                    result_rate += run_ispc(image, OUT_CIMAGE, block_size, quality)
+                    decompress(OUT_CIMAGE, OUT_DIMAGE)
+                    result_error = compare(image, OUT_DIMAGE)  # Value from the final repeat is the one reported
+                result_rate /= TEST_REPEATS  # Mean coding rate over the repeats
+
+                print("%s,Kodak%02u,%0.4f,%0.4f" % (block_size, index, result_rate, result_error))
+
+    # ASTCENC tests: run_astcenc is handed OUT_DIMAGE directly, so there is no decompress step
+    for block_size in TEST_BLOCK_SIZES:
+        for quality in ASTC_QUALITY:
+            print(f"ASTC {quality} {block_size}")  # Section title row on stdout
+            print(f"ASTC {quality} {block_size}", file=sys.stderr)  # Same title echoed to stderr
+            for index in range(1, TEST_RANGE + 1):
+                result_rate = 0.0
+                for repeat in range(0, TEST_REPEATS):
+                    image = TEST_IMAGE % index
+                    result_rate += run_astcenc(image, OUT_DIMAGE, block_size, quality)
+                    result_error = compare(image, OUT_DIMAGE)  # Value from the final repeat is the one reported
+                result_rate /= TEST_REPEATS  # Mean coding rate over the repeats
+
+                print("%s,Kodak%02u,%0.4f,%0.4f" % (block_size, index, result_rate, result_error))
+
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())  # Exit with main()'s return code
+    except sp.CalledProcessError as ex:  # NOTE(review): run() calls sp.run without check=True, so this looks unreachable - confirm
+        print(ex.stdout)
+        print(ex.stderr)
diff --git a/lib/astc-encoder/Test/astc_test_competitive_plot.py b/lib/astc-encoder/Test/astc_test_competitive_plot.py
new file mode 100644
index 0000000000..e796a59f25
--- /dev/null
+++ b/lib/astc-encoder/Test/astc_test_competitive_plot.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# -----------------------------------------------------------------------------
+# Copyright 2022 Arm Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy
+# of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+# -----------------------------------------------------------------------------
+"""
+This script is a simple test result plotter for sweeps on multiple compressors.
+"""
+import csv
+import numpy as np
+import matplotlib.pyplot as plt
+import sys
+
+DATABASE = "log.csv"
+
+
+class Series:
+    """A named data series pairing a list of perf values with a list of qual values."""
+    def __init__(self, name, perf, qual):
+        self.name = name  # Label used in plot legends
+        self.perf = perf  # Values plotted on the x axis
+        self.qual = qual  # Values plotted on the y axis
+
+
+def get_series(database, compressor, quality, block_size):
+    """Return (perf, qual) float lists read from the CSV section titled "<compressor> <quality> <block_size>"."""
+    title = f"{compressor} {quality} {block_size}"
+    in_section = False
+    perf = []
+    qual = []
+
+    with open(database) as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            if len(row) == 1:  # Single-field rows are treated as section titles
+                in_section = row[0] == title  # Track whether the wanted section is active
+                continue
+
+            if in_section:
+                perf.append(float(row[2]))  # Third column feeds the perf list
+                qual.append(float(row[3]))  # Fourth column feeds the qual list
+
+    return (perf, qual)
+
+
+def plot(block_size, series_set):
+    """Scatter-plot every series in series_set and save the chart as ASTC_v_ISPC_<block_size>.png."""
+    for series in series_set:
+        plt.scatter(series.perf, series.qual, s=2, label=series.name)
+
+    plt.xlabel("Speed (MT/s)")
+    plt.ylabel("PSNR dB")
+    plt.legend(loc='lower right', prop={'size': 6})
+
+    plt.tight_layout()
+    plt.savefig(f"ASTC_v_ISPC_{block_size}.png")
+    plt.clf()  # Clear the figure so the next chart starts empty
+
+
+def plot_diff(series_a, series_b):
+    """Scatter series_a relative to series_b (perf ratio vs. qual difference) and save it as a PNG."""
+    diff_perf = np.divide(series_a.perf, series_b.perf)  # Element-wise a / b
+    diff_qual = np.subtract(series_a.qual, series_b.qual)  # Element-wise a - b
+    label = f"{series_a.name} vs {series_b.name}"
+
+    plt.scatter(diff_perf, diff_qual, s=2, label=label)
+
+    plt.axhline(y=0, color="r", linestyle="dotted", lw=0.5)  # Reference line: equal qual
+    plt.axvline(x=1, color="r", linestyle="dotted", lw=0.5)  # Reference line: equal perf
+
+    plt.xlabel("Relative speed")
+    plt.ylabel("PSNR diff (dB)")
+    plt.legend(loc='lower right', prop={'size': 6})
+
+    plt.tight_layout()
+    file_name = label.replace(" ", "_") + ".png"  # Output name is derived from the legend label
+    plt.savefig(file_name)
+    plt.clf()  # Clear the figure so the next chart starts empty
+
+
+def main():
+    """Plot per-block-size scatter charts and ASTC-vs-ISPC diff charts from DATABASE; returns 0."""
+    block_sizes = ["4x4", "6x6", "8x8"]
+
+    for block_size in block_sizes:
+        series_set = []
+
+        perf, qual = get_series(DATABASE, "ISPC", "rgba", block_size)
+        series_set.append(Series(f"{block_size} ISPC Slow", perf, qual))
+
+        perf, qual = get_series(DATABASE, "ISPC", "rgb", block_size)
+        series_set.append(Series(f"{block_size} ISPC Fast", perf, qual))
+
+        perf, qual = get_series(DATABASE, "ASTC", "-medium", block_size)
+        series_set.append(Series(f"{block_size} ASTC Medium", perf, qual))
+
+        perf, qual = get_series(DATABASE, "ASTC", "-fast", block_size)
+        series_set.append(Series(f"{block_size} ASTC Fast", perf, qual))
+
+        perf, qual = get_series(DATABASE, "ASTC", "-fastest", block_size)
+        series_set.append(Series(f"{block_size} ASTC Fastest", perf, qual))
+
+        plot(block_size, series_set)
+
+        plot_diff(series_set[2], series_set[0])  # ASTC Medium vs ISPC Slow
+        plot_diff(series_set[3], series_set[0])  # ASTC Fast vs ISPC Slow
+        plot_diff(series_set[3], series_set[1])  # ASTC Fast vs ISPC Fast
+        plot_diff(series_set[4], series_set[1])  # ASTC Fastest vs ISPC Fast
+
+    return 0
+
+
+if __name__ == "__main__":  # Script entry point
+    sys.exit(main())  # Exit with main()'s return code