// host/vulkan/emulated_textures/shaders/AstcToBc3.comp - platform/hardware/google/gfxstream - Git at Google

 // Compute shader to convert ASTC textures to BC3 (ie: BC1 for color + BC4 for alpha).
 //
 // A bit of history
 // ----------------
 //
 // The algorithm used here for BC1 compression has a long history. It was originally published by
 // Simon Brown for the Squish encoder:
 //   https://www.sjbrown.co.uk/posts/dxt-compression-techniques/
 //   https://github.com/svn2github/libsquish/blob/c763145a30512c10450954b7a2b5b3a2f9a94e00/rangefit.cpp#L33
 //
 // It was then rewritten and improved upon by Fabian "ryg" Giesen for the stb_dxt encoder:
 //   https://github.com/GammaUNC/FasTC/blob/0f8cef65cf8f0fc5c58a2d06af3e0c3ad2374678/DXTEncoder/src/stb_dxt.h#L283
 //   https://fgiesen.wordpress.com/2022/11/08/whats-that-magic-computation-in-stb__refineblock/
 //
 // That version then made it to many places, including ANGLE, first as a C++ version:
 //   https://source.corp.google.com/android/external/angle/src/image_util/loadimage_etc.cpp;l=1073;bpv=0;bpt=0;rcl=90f88d3bc0d38ef5ec06ddaaef230db2d6e6fc02
 //
 // and then as a compute shader version upon which this shader is based:
 //   http://cs/android/external/angle/src/libANGLE/renderer/vulkan/shaders/src/EtcToBc.comp;rcl=81e45c881c54a7737f6fce95097f6df2f94cd76f
 //
 //
 // Useful links to understand BC1 compression
 // ------------------------------------------
 //
 //  http://www.ludicon.com/castano/blog/2022/11/bc1-compression-revisited/
 //  https://github.com/castano/icbc
 //  https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf
 //  https://fgiesen.wordpress.com/2022/11/08/whats-that-magic-computation-in-stb__refineblock/
 //  https://www.reedbeta.com/blog/understanding-bcn-texture-compression-formats/
 //  https://bartwronski.com/2020/05/21/dimensionality-reduction-for-image-and-texture-set-compression/
 //  https://core.ac.uk/download/pdf/210601023.pdf
 //  https://github.com/microsoft/Xbox-ATG-Samples/blob/main/XDKSamples/Graphics/FastBlockCompress/Shaders/BlockCompress.hlsli
 //  https://github.com/GammaUNC/FasTC/blob/0f8cef65cf8f0fc5c58a2d06af3e0c3ad2374678/DXTEncoder/src/stb_dxt.h
 //  https://github.com/darksylinc/betsy/blob/master/bin/Data/bc1.glsl
 //  https://github.com/GPUOpen-Tools/compressonator/blob/master/cmp_core/shaders/bc1_cmp.h
 //
 //
 // Optimization ideas
 // ------------------
 //
 // - Do the color refinement step from stb_dxt. This is probably the top priority. Currently, we
 //   only do the PCA step and we use the min and max colors as the endpoints. We should instead see
 //   if picking other endpoints on the PCA line would lead to better results.
 //
 // - Use dithering to improve quality. Betsy and FasTC encoders (links above) have examples.
 //
 // - Add a fast path for when all pixels are the same color (speed improvement)
 //
 // - Use BC1 instead of BC3 if the image doesn't contain semi-transparent pixels. We will need to
 //   add a pre-processing step to determine if there are such pixels. Alternatively, it could be
 //   done fairly efficiently as a post-processing step where we discard the BC4 data if all pixels
 //   are opaque, however in that case it would only work for fully opaque images (i.e. we
 //   wouldn't be able to take advantage of BC1's punch-through alpha).
 //
 // To-do list
 // ---------------
 //   - TODO(gregschlom): Check that the GPU has gl_SubgroupSize >= 16 before using this shader,
 //     otherwise it will give wrong results.
 //
 //   - TODO(gregschlom): Check if the results are correct for image sizes that aren't multiples of 4

 #version 450 core
 #include "AstcDecompressor.glsl"
 #include "Common.comp"

 // TODO(gregschlom): Check how widespread is support for these extensions.
 #extension GL_KHR_shader_subgroup_clustered : enable
 #extension GL_KHR_shader_subgroup_shuffle : enable

 // To maximize GPU utilization, we use a local workgroup size of 64 which is a multiple of the
 // subgroup size of both AMD and NVIDIA cards.
 // main() splits these 64 invocations into four groups of 16, one group per 4x4 texel block.
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

 // Using 2DArray textures for compatibility with the old ASTC decoder.
 // TODO(gregschlom): Once we have texture metrics, check if we want to keep supporting array
 // textures.
 // srcImage is read with imageLoad() in decodeRGBA(); dstImage is written with imageStore() in
 // main(), one uvec4 per 4x4 texel block.
 // NOTE(review): WITH_TYPE is not defined in this file; presumably it comes from Common.comp.
 layout(binding = 0, rgba32ui) readonly uniform WITH_TYPE(uimage) srcImage;
 layout(binding = 1, rgba32ui) writeonly uniform WITH_TYPE(uimage) dstImage;

 // Push-constant block, accessed through `u_pushConstant`.
 layout(push_constant) uniform imagInfo {
     // Size of an ASTC block in texels. decodeRGBA() uses it to split a texel position into a
     // block position and an offset inside that block.
     uvec2 blockSize;
     // Added to gl_WorkGroupID.z in main() to get the layer that is read and written.
     uint baseLayer;
     // Not read by the code in this shader.
     uint smallBlock;  // TODO(gregschlom) Remove this once we remove the old decoder.
 }
 u_pushConstant;

 // Loads the ASTC block that contains `texelPos` from `srcImage` and decodes that one texel.
 // texelPos: texel coordinates in the image
 // layer: layer index used for the image load
 // Returns the value produced by astcDecodeTexel(), which main() treats as RGBA.
 uvec4 decodeRGBA(uvec2 texelPos, uint layer) {
     uvec2 astcBlockSize = u_pushConstant.blockSize;
     ivec3 blockCoord = ivec3(texelPos / astcBlockSize, layer);

     // The four words of the block are loaded in reversed component order (.wzyx).
     // NOTE(review): `astcBlock` is not declared in this file; presumably it is a global owned by
     // AstcDecompressor.glsl.
     astcBlock = imageLoad(srcImage, WITH_TYPE(getPos)(blockCoord)).wzyx;
     astcDecoderInitialize(astcBlock, astcBlockSize);

     // Decode the texel at its offset inside the block.
     return astcDecodeTexel(texelPos % astcBlockSize);
 }

 // Picks the 2-bit BC1 selector of the palette entry that best approximates `color`.
 // color: the color to approximate
 // minEndpoint / maxEndpoint: the two BC1 endpoints chosen for the block
 uint getColorIndex(vec3 color, vec3 minEndpoint, vec3 maxEndpoint) {
     // Orthogonal projection of `color` onto the segment minEndpoint -> maxEndpoint, expressed as a
     // fraction of the segment: 0 at minEndpoint, 1 at maxEndpoint.
     //
     // TODO(gregschlom): this doesn't account for the fact that the color palette is actually
     // quantized as RGB565 instead of RGB8. A slower but potentially slightly higher quality
     // approach would be to compute all 4 RGB565 colors in the palette, then find the closest one.
     vec3 axis = maxEndpoint - minEndpoint;
     float t = dot(color - minEndpoint, axis) / dot(axis, axis);

     // Snap to the nearest of the 4 evenly spaced palette positions (0 = min ... 3 = max).
     float scaled = clamp(t * 3.0, 0.0, 3.0);
     int level = int(round(scaled));

     // BC1 numbers its palette differently: selector 0 is maxEndpoint, 1 is minEndpoint,
     // 2 is (1/3)*minEndpoint + (2/3)*maxEndpoint and 3 is (2/3)*minEndpoint + (1/3)*maxEndpoint.
     // The level -> selector table is therefore [1, 3, 2, 0]; packed with 2 bits per entry this is
     // 0x2D (45).
     //
     // Equivalent formula: `index = -index & 3; return index ^ uint(index < 2);`, but the lookup
     // table is faster.
     return bitfieldExtract(0x2Du, level * 2, 2);
 }

 // Picks the 3-bit BC4 selector that best approximates `alpha`, given the block's two alpha
 // endpoints. The caller must make sure that minAlpha != maxAlpha.
 uint getAlphaIndex(uint alpha, uint minAlpha, uint maxAlpha) {
     // Position of `alpha` between the endpoints, snapped to one of 8 levels (0 = min ... 7 = max).
     float t = float(alpha - minAlpha) / float(maxAlpha - minAlpha);
     float scaled = clamp(t * 7.0, 0.0, 7.0);
     int level = int(round(scaled));

     // BC4 selector numbering: 0 is maxAlpha, 1 is minAlpha, 2 is (1/7)*minAlpha + (6/7)*maxAlpha,
     // and so on. The level -> selector table is [1, 7, 6, 5, 4, 3, 2, 0]; stored with one 4-bit
     // slot per entry this is 0x02345671 (36984433). Only the low 3 bits of a slot are needed.
     //
     // Equivalent formula: `index = -index & 7; return index ^ uint(index < 2);`, but the lookup
     // table is faster.
     return bitfieldExtract(0x02345671u, level * 4, 3);
 }

 // Chooses the two BC1 endpoints of a 4x4 block by fitting a line through its colors with
 // Principal Component Analysis. Must be called by all 16 invocations of a subgroup cluster, each
 // one passing the color of its own texel.
 // rgbColor: color of the texel handled by this invocation
 // minEndpoint / maxEndpoint: (out) colors of the block's texels that project the lowest and the
 //                            highest on the fitted line
 void computeEndpoints(uvec3 rgbColor, out uvec3 minEndpoint, out uvec3 maxEndpoint) {
     // See the comment at the top of this file for more details on this algorithm.

     // Per-block statistics, shared by the 16 invocations of the cluster.
     uvec3 blockSum = subgroupClusteredAdd(rgbColor, 16);
     uvec3 mean = (blockSum + 8) >> 4;  // Divide by 16, rounding to nearest.
     uvec3 lowest = subgroupClusteredMin(rgbColor, 16);
     uvec3 highest = subgroupClusteredMax(rgbColor, 16);

     // Flat block: every texel has the same color, so both endpoints are that color.
     if (lowest == highest) {
         minEndpoint = lowest;
         maxEndpoint = lowest;
         return;
     }

     // Covariance of the r, g and b channels. The matrix is symmetric, so only 6 sums are needed.
     ivec3 delta = ivec3(rgbColor) - ivec3(mean);
     vec3 covR = subgroupClusteredAdd(delta.r * delta, 16);         // rr, rg, rb
     vec3 covGB = subgroupClusteredAdd(delta.ggb * delta.gbb, 16);  // gg, gb, bb

     mat3 covariance = mat3(covR,                       // rr, rg, rb
                            vec3(covR.y, covGB.xy),     // rg, gg, gb
                            vec3(covR.z, covGB.yz));    // rb, gb, bb

     // Power iteration (https://en.wikipedia.org/wiki/Power_iteration) to approximate the
     // principal axis: 4 multiplications in total, starting from the block's color range.
     // 3 to 8 iterations are sufficient for a good approximation.
     // Note: in theory the vector should be normalized on each iteration, however quality is
     // significantly higher (and performance obviously better) without it.
     // TODO(gregschlom): Investigate why that is the case.
     vec3 axis = covariance * (highest - lowest);
     for (int i = 0; i < 3; ++i) {
         axis = covariance * axis;
     }

     // Rescale so that all components are in the [-1,1] range.
     // TODO(gregschlom): Investigate if we really need this. It doesn't make a lot of sense.
     float largest = max(max(abs(axis.r), abs(axis.g)), abs(axis.b));
     if (largest < 4.0) {
         // The axis is too small to be meaningful: fall back to the RGB -> luminance weights.
         axis = vec3(0.299f, 0.587f, 0.114f);
     } else {
         axis = axis / largest;
     }

     // Project every texel on the axis and find the two extreme projections of the block.
     float projection = dot(rgbColor, axis);
     float nearEnd = subgroupClusteredMin(projection, 16);
     float farEnd = subgroupClusteredMax(projection, 16);

     // Invocations sitting at an extreme publish their subgroup invocation id (all others publish
     // 0); the highest id wins when several texels tie.
     uint minCandidate = 0u;
     uint maxCandidate = 0u;
     if (projection == nearEnd) {
         minCandidate = gl_SubgroupInvocationID;
     }
     if (projection == farEnd) {
         maxCandidate = gl_SubgroupInvocationID;
     }
     uvec2 owners = subgroupClusteredMax(uvec2(minCandidate, maxCandidate), 16);

     // TODO(gregschlom): we're returning the original pixel colors instead of the projected colors.
     // Investigate if we could increase quality by returning the projected colors.
     minEndpoint = subgroupShuffle(rgbColor, owners.x);
     maxEndpoint = subgroupShuffle(rgbColor, owners.y);
 }

 // Builds the 64-bit BC4 block for the alpha channel of a 4x4 block. Must be called by all 16
 // invocations of a subgroup cluster.
 // value: alpha of the texel handled by this invocation
 // texelId: index of that texel inside the block, in [0, 15]
 // Returns the block as (low 32 bits, high 32 bits).
 uvec2 encodeAlpha(uint value, uint texelId) {
     uint lo = subgroupClusteredMin(value, 16);
     uint hi = subgroupClusteredMax(value, 16);

     // 3-bit selector of this texel. A block with a single alpha value uses selector 0 everywhere.
     uint selector = 0u;
     if (lo != hi) {
         selector = getAlphaIndex(value, lo, hi);
     }

     // Layout of the 64-bit block: bits 0-15 hold the max and min values, then texel i stores its
     // selector at bit 16 + 3*i (bit 16 for texel 0 up to bit 61 for texel 15). Texels 0-4 fit in
     // the low word, texels 6-15 in the high word, and texel 5 (bits 31-33) straddles both.
     // Note: shifting a uint by more than 31 is UB, hence the guards around each shift.
     uint highWord = 0u;
     uint lowWord = 0u;
     if (texelId >= 5u) {
         highWord = (selector << 29) >> (45u - 3u * texelId);
     }
     if (texelId <= 5u) {
         lowWord = selector << (3u * texelId + 16u);
     }

     // Merge the contributions of the 16 texels.
     uvec2 selectorBits = subgroupClusteredOr(uvec2(highWord, lowWord), 16);

     uint endpointBits = (hi & 0xffu) | ((lo & 0xffu) << 8);
     return uvec2(endpointBits | selectorBits.y, selectorBits.x);
 }

 // Quantizes an 8-bit-per-channel RGB color to RGB565 (rounding to nearest) and packs it as
 // red in bits 11-15, green in bits 5-10 and blue in bits 0-4.
 uint packColorToRGB565(uvec3 color) {
     const vec3 channelMax = vec3(31.0, 63.0, 31.0);
     vec3 scaled = vec3(color) * channelMax / vec3(255.0);
     uvec3 q = uvec3(round(scaled));

     uint packedColor = q.r << 11;
     packedColor |= q.g << 5;
     packedColor |= q.b;
     return packedColor;
 }

 // Entry point. Each invocation decodes one ASTC texel; every cluster of 16 invocations then
 // cooperates to encode one 4x4 BC3 block, which the first invocation of the cluster stores.
 void main() {
     // gl_LocalInvocationID can't be used here because the spec makes no guarantee on how it maps
     // to gl_SubgroupInvocationID (See: https://stackoverflow.com/q/72451338/). The
     // subgroupClusteredXXX commands need any 16 consecutive subgroup invocation ids
     // [16n, 16n+1..16n+15] to belong to the same 4x4 block of the input image, so the local id is
     // rebuilt from the subgroup ids instead. It is in [0, 63] since the local group size is 64.
     uint flatId = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;

     uint blockId = flatId >> 4;   // [0-3]  Id of the 4x4 block we're working on
     uint texelId = flatId & 15u;  // [0-15] Id of the texel within the 4x4 block

     // A workgroup covers an 8x8 texel area made of 2x2 blocks. Bit 0 of blockId selects the
     // column of the block and bit 1 its row.
     uvec2 blockOrigin = 8 * gl_WorkGroupID.xy + uvec2(4 * (blockId & 0x1), 2 * (blockId & 0x2));
     // Absolute coordinates in the input image
     uvec2 texelCoord = blockOrigin + uvec2(texelId % 4, texelId / 4);

     // Layer, for array textures.
     uint layer = u_pushConstant.baseLayer + gl_WorkGroupID.z;

     uvec4 texel = decodeRGBA(texelCoord, layer);

     // Color endpoints of the block, in RGB8 and packed as RGB565.
     uvec3 minEndpoint, maxEndpoint;
     computeEndpoints(texel.rgb, minEndpoint, maxEndpoint);
     uvec2 endpoints = uvec2(packColorToRGB565(minEndpoint), packColorToRGB565(maxEndpoint));

     // Selector of the palette color closest to this texel. When both endpoints quantize to the
     // same RGB565 value every selector stays 0.
     uint selector = 0;
     if (endpoints.x != endpoints.y) {
         selector = getColorIndex(vec3(texel.rgb), vec3(minEndpoint), vec3(maxEndpoint));
     }

     // Keep the larger packed endpoint in endpoints.y. Swapping the endpoints exchanges selectors
     // 0 <-> 1 and 2 <-> 3, which is what flipping bit 0 does.
     if (endpoints.x > endpoints.y) {
         endpoints = endpoints.yx;
         selector ^= 1;
     }

     // Assemble the 128-bit BC3 block: 64 bits of alpha (BC4), then the two color endpoints, then
     // the sixteen 2-bit color selectors.
     uvec2 alphaBits = encodeAlpha(texel.a, texelId);
     uint colorEndpoints = endpoints.y | (endpoints.x << 16);
     uint colorSelectors = subgroupClusteredOr(selector << (2 * texelId), 16);
     uvec4 result = uvec4(alphaBits, colorEndpoints, colorSelectors);

     // One store per block, done by the invocation that owns texel 0.
     if (texelId == 0) {
         imageStore(dstImage, WITH_TYPE(getPos)(ivec3(texelCoord / 4, layer)), result);
     }
 }
	// Compute shader to convert ASTC textures to BC3 (ie: BC1 for color + BC4 for alpha).
	//
	// A bit of history
	// ----------------
	//
	// The algorithm used here for BC1 compression has a long history. It was originally published by
	// Simon Brown for the Squish encoder:
	// https://www.sjbrown.co.uk/posts/dxt-compression-techniques/
	// https://github.com/svn2github/libsquish/blob/c763145a30512c10450954b7a2b5b3a2f9a94e00/rangefit.cpp#L33
	//
	// It was then rewritten and improved upon by Fabian "ryg" Giesen for the stb_dxt encoder:
	// https://github.com/GammaUNC/FasTC/blob/0f8cef65cf8f0fc5c58a2d06af3e0c3ad2374678/DXTEncoder/src/stb_dxt.h#L283
	// https://fgiesen.wordpress.com/2022/11/08/whats-that-magic-computation-in-stb__refineblock/
	//
	// That version then made it to many places, including ANGLE, first as a C++ version:
	// https://source.corp.google.com/android/external/angle/src/image_util/loadimage_etc.cpp;l=1073;bpv=0;bpt=0;rcl=90f88d3bc0d38ef5ec06ddaaef230db2d6e6fc02
	//
	// and then as a compute shader version upon which this shader is based:
	// http://cs/android/external/angle/src/libANGLE/renderer/vulkan/shaders/src/EtcToBc.comp;rcl=81e45c881c54a7737f6fce95097f6df2f94cd76f
	//
	//
	// Useful links to understand BC1 compression
	// ------------------------------------------
	//
	// http://www.ludicon.com/castano/blog/2022/11/bc1-compression-revisited/
	// https://github.com/castano/icbc
	// https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf
	// https://fgiesen.wordpress.com/2022/11/08/whats-that-magic-computation-in-stb__refineblock/
	// https://www.reedbeta.com/blog/understanding-bcn-texture-compression-formats/
	// https://bartwronski.com/2020/05/21/dimensionality-reduction-for-image-and-texture-set-compression/
	// https://core.ac.uk/download/pdf/210601023.pdf
	// https://github.com/microsoft/Xbox-ATG-Samples/blob/main/XDKSamples/Graphics/FastBlockCompress/Shaders/BlockCompress.hlsli
	// https://github.com/GammaUNC/FasTC/blob/0f8cef65cf8f0fc5c58a2d06af3e0c3ad2374678/DXTEncoder/src/stb_dxt.h
	// https://github.com/darksylinc/betsy/blob/master/bin/Data/bc1.glsl
	// https://github.com/GPUOpen-Tools/compressonator/blob/master/cmp_core/shaders/bc1_cmp.h
	//
	//
	// Optimization ideas
	// ------------------
	//
	// - Do the color refinement step from stb_dxt. This is probably the top priority. Currently, we
	// only do the PCA step and we use the min and max colors as the endpoints. We should instead see
	// if picking other endpoints on the PCA line would lead to better results.
	//
	// - Use dithering to improve quality. Betsy and FasTC encoders (links above) have examples.
	//
	// - Add a fast path for when all pixels are the same color (speed improvement)
	//
	// - Use BC1 instead of BC3 if the image doesn't contain semi-transparent pixels. We will need to
	// add a pre-processing step to determine if there are such pixels. Alternatively, it could be
	// done fairly efficiently as a post-processing step where we discard the BC4 data if all pixels
	// are opaque, however in that case it would only work for fully opaque images (i.e. we
	// wouldn't be able to take advantage of BC1's punch-through alpha).
	//
	// To-do list
	// ---------------
	// - TODO(gregschlom): Check that the GPU has gl_SubgroupSize >= 16 before using this shader,
	// otherwise it will give wrong results.
	//
	// - TODO(gregschlom): Check if the results are correct for image sizes that aren't multiples of 4

	#version 450 core
	#include "AstcDecompressor.glsl"
	#include "Common.comp"

	// TODO(gregschlom): Check how widespread is support for these extensions.
	#extension GL_KHR_shader_subgroup_clustered : enable
	#extension GL_KHR_shader_subgroup_shuffle : enable

	// To maximize GPU utilization, we use a local workgroup size of 64 which is a multiple of the
	// subgroup size of both AMD and NVIDIA cards.
	// main() splits these 64 invocations into four groups of 16, one group per 4x4 texel block.
	layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

	// Using 2DArray textures for compatibility with the old ASTC decoder.
	// TODO(gregschlom): Once we have texture metrics, check if we want to keep supporting array
	// textures.
	// srcImage is read with imageLoad() in decodeRGBA(); dstImage is written with imageStore() in
	// main(), one uvec4 per 4x4 texel block.
	// NOTE(review): WITH_TYPE is not defined in this file; presumably it comes from Common.comp.
	layout(binding = 0, rgba32ui) readonly uniform WITH_TYPE(uimage) srcImage;
	layout(binding = 1, rgba32ui) writeonly uniform WITH_TYPE(uimage) dstImage;

	// Push-constant block, accessed through `u_pushConstant`.
	layout(push_constant) uniform imagInfo {
	// Size of an ASTC block in texels. decodeRGBA() uses it to split a texel position into a
	// block position and an offset inside that block.
	uvec2 blockSize;
	// Added to gl_WorkGroupID.z in main() to get the layer that is read and written.
	uint baseLayer;
	// Not read by the code in this shader.
	uint smallBlock; // TODO(gregschlom) Remove this once we remove the old decoder.
	}
	u_pushConstant;

	// Loads the ASTC block that contains `texelPos` from `srcImage` and decodes that one texel.
	// texelPos: texel coordinates in the image
	// layer: layer index used for the image load
	// Returns the value produced by astcDecodeTexel(), which main() treats as RGBA.
	uvec4 decodeRGBA(uvec2 texelPos, uint layer) {
	    uvec2 astcBlockSize = u_pushConstant.blockSize;
	    ivec3 blockCoord = ivec3(texelPos / astcBlockSize, layer);

	    // The four words of the block are loaded in reversed component order (.wzyx).
	    // NOTE(review): `astcBlock` is not declared in this file; presumably it is a global owned
	    // by AstcDecompressor.glsl.
	    astcBlock = imageLoad(srcImage, WITH_TYPE(getPos)(blockCoord)).wzyx;
	    astcDecoderInitialize(astcBlock, astcBlockSize);

	    // Decode the texel at its offset inside the block.
	    return astcDecodeTexel(texelPos % astcBlockSize);
	}

	// Picks the 2-bit BC1 selector of the palette entry that best approximates `color`.
	// color: the color to approximate
	// minEndpoint / maxEndpoint: the two BC1 endpoints chosen for the block
	uint getColorIndex(vec3 color, vec3 minEndpoint, vec3 maxEndpoint) {
	    // Orthogonal projection of `color` onto the segment minEndpoint -> maxEndpoint, expressed
	    // as a fraction of the segment: 0 at minEndpoint, 1 at maxEndpoint.
	    //
	    // TODO(gregschlom): this doesn't account for the fact that the color palette is actually
	    // quantized as RGB565 instead of RGB8. A slower but potentially slightly higher quality
	    // approach would be to compute all 4 RGB565 colors in the palette, then find the closest
	    // one.
	    vec3 axis = maxEndpoint - minEndpoint;
	    float t = dot(color - minEndpoint, axis) / dot(axis, axis);

	    // Snap to the nearest of the 4 evenly spaced palette positions (0 = min ... 3 = max).
	    float scaled = clamp(t * 3.0, 0.0, 3.0);
	    int level = int(round(scaled));

	    // BC1 numbers its palette differently: selector 0 is maxEndpoint, 1 is minEndpoint,
	    // 2 is (1/3)*minEndpoint + (2/3)*maxEndpoint and 3 is (2/3)*minEndpoint + (1/3)*maxEndpoint.
	    // The level -> selector table is therefore [1, 3, 2, 0]; packed with 2 bits per entry this
	    // is 0x2D (45).
	    //
	    // Equivalent formula: `index = -index & 3; return index ^ uint(index < 2);`, but the lookup
	    // table is faster.
	    return bitfieldExtract(0x2Du, level * 2, 2);
	}

	// Picks the 3-bit BC4 selector that best approximates `alpha`, given the block's two alpha
	// endpoints. The caller must make sure that minAlpha != maxAlpha.
	uint getAlphaIndex(uint alpha, uint minAlpha, uint maxAlpha) {
	    // Position of `alpha` between the endpoints, snapped to one of 8 levels (0 = min ... 7 = max).
	    float t = float(alpha - minAlpha) / float(maxAlpha - minAlpha);
	    float scaled = clamp(t * 7.0, 0.0, 7.0);
	    int level = int(round(scaled));

	    // BC4 selector numbering: 0 is maxAlpha, 1 is minAlpha, 2 is (1/7)*minAlpha +
	    // (6/7)*maxAlpha, and so on. The level -> selector table is [1, 7, 6, 5, 4, 3, 2, 0]; stored
	    // with one 4-bit slot per entry this is 0x02345671 (36984433). Only the low 3 bits of a
	    // slot are needed.
	    //
	    // Equivalent formula: `index = -index & 7; return index ^ uint(index < 2);`, but the lookup
	    // table is faster.
	    return bitfieldExtract(0x02345671u, level * 4, 3);
	}

	// Chooses the two BC1 endpoints of a 4x4 block by fitting a line through its colors with
	// Principal Component Analysis. Must be called by all 16 invocations of a subgroup cluster,
	// each one passing the color of its own texel.
	// rgbColor: color of the texel handled by this invocation
	// minEndpoint / maxEndpoint: (out) colors of the block's texels that project the lowest and
	//                            the highest on the fitted line
	void computeEndpoints(uvec3 rgbColor, out uvec3 minEndpoint, out uvec3 maxEndpoint) {
	    // See the comment at the top of this file for more details on this algorithm.

	    // Per-block statistics, shared by the 16 invocations of the cluster.
	    uvec3 blockSum = subgroupClusteredAdd(rgbColor, 16);
	    uvec3 mean = (blockSum + 8) >> 4;  // Divide by 16, rounding to nearest.
	    uvec3 lowest = subgroupClusteredMin(rgbColor, 16);
	    uvec3 highest = subgroupClusteredMax(rgbColor, 16);

	    // Flat block: every texel has the same color, so both endpoints are that color.
	    if (lowest == highest) {
	        minEndpoint = lowest;
	        maxEndpoint = lowest;
	        return;
	    }

	    // Covariance of the r, g and b channels. The matrix is symmetric, so only 6 sums are
	    // needed.
	    ivec3 delta = ivec3(rgbColor) - ivec3(mean);
	    vec3 covR = subgroupClusteredAdd(delta.r * delta, 16);         // rr, rg, rb
	    vec3 covGB = subgroupClusteredAdd(delta.ggb * delta.gbb, 16);  // gg, gb, bb

	    mat3 covariance = mat3(covR,                       // rr, rg, rb
	                           vec3(covR.y, covGB.xy),     // rg, gg, gb
	                           vec3(covR.z, covGB.yz));    // rb, gb, bb

	    // Power iteration (https://en.wikipedia.org/wiki/Power_iteration) to approximate the
	    // principal axis: 4 multiplications in total, starting from the block's color range.
	    // 3 to 8 iterations are sufficient for a good approximation.
	    // Note: in theory the vector should be normalized on each iteration, however quality is
	    // significantly higher (and performance obviously better) without it.
	    // TODO(gregschlom): Investigate why that is the case.
	    vec3 axis = covariance * (highest - lowest);
	    for (int i = 0; i < 3; ++i) {
	        axis = covariance * axis;
	    }

	    // Rescale so that all components are in the [-1,1] range.
	    // TODO(gregschlom): Investigate if we really need this. It doesn't make a lot of sense.
	    float largest = max(max(abs(axis.r), abs(axis.g)), abs(axis.b));
	    if (largest < 4.0) {
	        // The axis is too small to be meaningful: fall back to the RGB -> luminance weights.
	        axis = vec3(0.299f, 0.587f, 0.114f);
	    } else {
	        axis = axis / largest;
	    }

	    // Project every texel on the axis and find the two extreme projections of the block.
	    float projection = dot(rgbColor, axis);
	    float nearEnd = subgroupClusteredMin(projection, 16);
	    float farEnd = subgroupClusteredMax(projection, 16);

	    // Invocations sitting at an extreme publish their subgroup invocation id (all others
	    // publish 0); the highest id wins when several texels tie.
	    uint minCandidate = 0u;
	    uint maxCandidate = 0u;
	    if (projection == nearEnd) {
	        minCandidate = gl_SubgroupInvocationID;
	    }
	    if (projection == farEnd) {
	        maxCandidate = gl_SubgroupInvocationID;
	    }
	    uvec2 owners = subgroupClusteredMax(uvec2(minCandidate, maxCandidate), 16);

	    // TODO(gregschlom): we're returning the original pixel colors instead of the projected
	    // colors. Investigate if we could increase quality by returning the projected colors.
	    minEndpoint = subgroupShuffle(rgbColor, owners.x);
	    maxEndpoint = subgroupShuffle(rgbColor, owners.y);
	}

	// Builds the 64-bit BC4 block for the alpha channel of a 4x4 block. Must be called by all 16
	// invocations of a subgroup cluster.
	// value: alpha of the texel handled by this invocation
	// texelId: index of that texel inside the block, in [0, 15]
	// Returns the block as (low 32 bits, high 32 bits).
	uvec2 encodeAlpha(uint value, uint texelId) {
	    uint minValue = subgroupClusteredMin(value, 16);
	    uint maxValue = subgroupClusteredMax(value, 16);

	    // Determine the alpha index (between 0 and 7). A block with a single alpha value uses
	    // index 0 for every texel.
	    uint index = (minValue != maxValue) ? getAlphaIndex(value, minValue, maxValue) : 0;

	    // Pack everything together into 64 bits. The first 3-bit index goes at bit 16, the next
	    // one at bit 19 and so on until the last one which goes at bit 61. The bottom 16 bits will
	    // contain the max and min value. Texel 5 (bits 31-33) straddles the two 32-bit words, so
	    // it contributes to both components of the mask.
	    // Note: shifting a uint by more than 31 is UB, which is why we need the ternary operator
	    // here.
	    uvec2 mask = uvec2(texelId < 5 ? 0 : (index << 29) >> (-3 * texelId + 45),
	                       texelId > 5 ? 0 : index << (3 * texelId + 16));
	    uvec2 packed = subgroupClusteredOr(mask, 16);

	    // Bitwise OR operators restored: the escaped `\|` sequences were not valid GLSL.
	    return uvec2((maxValue & 0xff) | ((minValue & 0xff) << 8) | packed[1], packed[0]);
	}

	// Quantizes an 8-bit-per-channel RGB color to RGB565 (rounding to nearest) and packs it as
	// red in bits 11-15, green in bits 5-10 and blue in bits 0-4.
	uint packColorToRGB565(uvec3 color) {
	    uvec3 quant = uvec3(round(vec3(color) * vec3(31.0, 63.0, 31.0) / vec3(255.0)));
	    // Bitwise OR operators restored: the escaped `\|` sequences were not valid GLSL.
	    return (quant.r << 11) | (quant.g << 5) | quant.b;
	}

	// Entry point. Each invocation decodes one ASTC texel; every cluster of 16 invocations then
	// cooperates to encode one 4x4 BC3 block, which the first invocation of the cluster stores.
	void main() {
	    // We can't use gl_LocalInvocationID here because the spec doesn't make any guarantees as
	    // to how it will be mapped to gl_SubgroupInvocationID
	    // (See: https://stackoverflow.com/q/72451338/).
	    // And since we use subgroupClusteredXXX commands, we must ensure that any 16 consecutive
	    // subgroup invocation ids [16n, 16n+1..16n+15] map to the same 4x4 block in the input
	    // image. So instead of using gl_LocalInvocationID, we construct it from the subgroup ids.
	    // This is a number in the range [0, 63] since local group size is 64
	    uint localId = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;

	    uint blockId = localId / 16;  // [0-3]  Id of the 4x4 block we're working on
	    uint texelId = localId % 16;  // [0-15] Id of the texel within the 4x4 block

	    // Absolute coordinates in the input image. A workgroup covers an 8x8 texel area: bit 0 of
	    // blockId selects the column of the block and bit 1 its row.
	    uvec2 texelCoord = 8 * gl_WorkGroupID.xy + uvec2(4 * (blockId & 0x1) + (texelId % 4),
	                                                     2 * (blockId & 0x2) + (texelId / 4));
	    // Layer, for array textures.
	    uint layer = u_pushConstant.baseLayer + gl_WorkGroupID.z;

	    uvec4 currentTexel = decodeRGBA(texelCoord, layer);

	    // Compute the color endpoints
	    uvec3 minEndpoint, maxEndpoint;
	    computeEndpoints(currentTexel.rgb, minEndpoint, maxEndpoint);
	    uvec2 endpoints = uvec2(packColorToRGB565(minEndpoint), packColorToRGB565(maxEndpoint));

	    // Find which of the 4 colors best matches the color of the current texel. When both
	    // endpoints quantize to the same RGB565 value the index stays 0.
	    uint index = 0;
	    if (endpoints.x != endpoints.y) {
	        index = getColorIndex(vec3(currentTexel.rgb), vec3(minEndpoint), vec3(maxEndpoint));
	    }
	    // Keep the larger packed endpoint in endpoints.y. Swapping the endpoints exchanges
	    // indices 0 <-> 1 and 2 <-> 3, which is what flipping bit 0 does.
	    if (endpoints.x > endpoints.y) {
	        index ^= 1;
	        endpoints = endpoints.yx;
	    }

	    // Pack everything together: 64 bits of alpha (BC4), then the two color endpoints, then the
	    // sixteen 2-bit color indices.
	    uvec4 result;
	    result.rg = encodeAlpha(currentTexel.a, texelId);
	    // Bitwise OR operator restored: the escaped `\|` sequence was not valid GLSL.
	    result.b = endpoints.y | (endpoints.x << 16);
	    result.a = subgroupClusteredOr(index << (2 * texelId), 16);

	    // One store per block, done by the invocation that owns texel 0.
	    if (texelId == 0) {
	        imageStore(dstImage, WITH_TYPE(getPos)(ivec3(texelCoord / 4, layer)), result);
	    }
	}