| // Compute shader to convert ASTC textures to BC3 (ie: BC1 for color + BC4 for alpha). |
| // |
| // A bit of history |
| // ---------------- |
| // |
| // The algorithm used here for BC1 compression has a long history. It was originally published by |
| // Simon Brown for the Squish encoder: |
| // https://www.sjbrown.co.uk/posts/dxt-compression-techniques/ |
| // https://github.com/svn2github/libsquish/blob/c763145a30512c10450954b7a2b5b3a2f9a94e00/rangefit.cpp#L33 |
| // |
| // It was then rewritten and improved upon by Fabian "ryg" Giesen for the stb_dxt encoder: |
| // https://github.com/GammaUNC/FasTC/blob/0f8cef65cf8f0fc5c58a2d06af3e0c3ad2374678/DXTEncoder/src/stb_dxt.h#L283 |
| // https://fgiesen.wordpress.com/2022/11/08/whats-that-magic-computation-in-stb__refineblock/ |
| // |
| // That version then made it to many places, including ANGLE, first as a C++ version: |
| // https://source.corp.google.com/android/external/angle/src/image_util/loadimage_etc.cpp;l=1073;bpv=0;bpt=0;rcl=90f88d3bc0d38ef5ec06ddaaef230db2d6e6fc02 |
| // |
| // and then as a compute shader version upon which this shader is based: |
| // http://cs/android/external/angle/src/libANGLE/renderer/vulkan/shaders/src/EtcToBc.comp;rcl=81e45c881c54a7737f6fce95097f6df2f94cd76f |
| // |
| // |
| // Useful links to understand BC1 compression |
| // ------------------------------------------ |
| // |
| // http://www.ludicon.com/castano/blog/2022/11/bc1-compression-revisited/ |
| // https://github.com/castano/icbc |
| // https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf |
| // https://fgiesen.wordpress.com/2022/11/08/whats-that-magic-computation-in-stb__refineblock/ |
| // https://www.reedbeta.com/blog/understanding-bcn-texture-compression-formats/ |
| // https://bartwronski.com/2020/05/21/dimensionality-reduction-for-image-and-texture-set-compression/ |
| // https://core.ac.uk/download/pdf/210601023.pdf |
| // https://github.com/microsoft/Xbox-ATG-Samples/blob/main/XDKSamples/Graphics/FastBlockCompress/Shaders/BlockCompress.hlsli |
| // https://github.com/GammaUNC/FasTC/blob/0f8cef65cf8f0fc5c58a2d06af3e0c3ad2374678/DXTEncoder/src/stb_dxt.h |
| // https://github.com/darksylinc/betsy/blob/master/bin/Data/bc1.glsl |
| // https://github.com/GPUOpen-Tools/compressonator/blob/master/cmp_core/shaders/bc1_cmp.h |
| // |
| // |
| // Optimization ideas |
| // ------------------ |
| // |
| // - Do the color refinement step from stb_dxt. This is probably the top priority. Currently, we |
| // only do the PCA step and we use the min and max colors as the endpoints. We should instead see |
| // if picking other endpoints on the PCA line would lead to better results. |
| // |
| // - Use dithering to improve quality. Betsy and FasTC encoders (links above) have examples. |
| // |
| // - Add a fast path for when all pixels are the same color (speed improvement) |
| // |
| // - Use BC1 instead of BC3 if the image doesn't contain semi-transparent pixels. We will need to |
| // add a pre-processing step to determine if there are such pixels. Alternatively, it could be |
| // done fairly efficiently as a post-processing step where we discard the BC4 data if all pixels |
| // are opaque, however in that case it would only work for fully opaque images (ie: we wouldn't be |
| // able to take advantage of BC1's punch-through alpha). |
| // |
| // To-do list |
| // --------------- |
| // - TODO(gregschlom): Check that the GPU has gl_SubgroupSize >= 16 before using this shader, |
| // otherwise it will give wrong results. |
| // |
| // - TODO(gregschlom): Check if the results are correct for image sizes that aren't multiples of 4 |
| |
| #version 450 core |
| #include "AstcDecompressor.glsl" |
| #include "Common.comp" |
| |
| // TODO(gregschlom): Check how widespread support for these extensions is. |
| #extension GL_KHR_shader_subgroup_clustered : enable |
| #extension GL_KHR_shader_subgroup_shuffle : enable |
| |
| // To maximize GPU utilization, we use a local workgroup size of 64 which is a multiple of the |
| // subgroup size of both AMD and NVIDIA cards. |
| layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; /* 64 invocations = four 4x4 blocks of 16 invocations each (see main()). */ |
| |
| // Using 2DArray textures for compatibility with the old ASTC decoder. |
| // TODO(gregschlom): Once we have texture metrics, check if we want to keep supporting array textures. |
| layout(binding = 0, rgba32ui) readonly uniform WITH_TYPE(uimage) srcImage; /* Input: decodeRGBA() fetches one texel per ASTC block. */ |
| layout(binding = 1, rgba32ui) writeonly uniform WITH_TYPE(uimage) dstImage; /* Output: main() stores one texel per 4x4 BC3 block. */ |
| |
| layout(push_constant) uniform imagInfo { |
| uvec2 blockSize; /* ASTC block dimensions in texels; used to split a texel position into block + offset. */ |
| uint baseLayer; /* Added to gl_WorkGroupID.z in main() to select the layer to process. */ |
| uint smallBlock; // TODO(gregschlom) Remove this once we remove the old decoder. Not read by the code in this file. |
| } |
| u_pushConstant; |
| |
| // Decodes the single ASTC-encoded texel located at `texelPos` (in texels) on array layer `layer` |
| // of the source image, and returns it as RGBA. |
| uvec4 decodeRGBA(uvec2 texelPos, uint layer) { |
|     uvec2 astcBlockDim = u_pushConstant.blockSize; |
| |
|     // The source image stores one texel per ASTC block, so the block coordinate is the fetch |
|     // coordinate. |
|     ivec3 fetchCoord = ivec3(texelPos / astcBlockDim, layer); |
| |
|     // The block is loaded with its components in reversed order (.wzyx) into `astcBlock`, which |
|     // is not declared in this file (presumably it comes from AstcDecompressor.glsl). |
|     astcBlock = imageLoad(srcImage, WITH_TYPE(getPos)(fetchCoord)).wzyx; |
|     astcDecoderInitialize(astcBlock, astcBlockDim); |
| |
|     // Decode the texel at its offset within the block. |
|     return astcDecodeTexel(texelPos % astcBlockDim); |
| } |
| |
| // Maps `color` to the 2-bit BC1 selector of the palette entry that approximates it best. |
| //   color: the color that we want to approximate |
| //   minEndpoint / maxEndpoint: the BC1 endpoint values we've chosen. They must differ, otherwise |
| //       the projection below divides by zero (main() only calls this when they do). |
| uint getColorIndex(vec3 color, vec3 minEndpoint, vec3 maxEndpoint) { |
|     // Find where `color` lands when projected on the segment minEndpoint -> maxEndpoint: |
|     // t == 0 at minEndpoint and t == 1 at maxEndpoint. |
|     // |
|     // TODO(gregschlom): this doesn't account for the fact that the color palette is actually |
|     // quantized as RGB565 instead of RGB8. A slower but potentially slightly higher quality |
|     // approach would be to compute all 4 RGB565 colors in the palette, then find the closest one. |
|     vec3 axis = maxEndpoint - minEndpoint; |
|     float t = dot(color - minEndpoint, axis) / dot(axis, axis); |
| |
|     // Snap to the nearest of the 4 evenly spaced palette positions (0 = min ... 3 = max). The |
|     // clamp takes care of colors that project outside of the segment. |
|     int slot = int(round(clamp(t * 3, 0, 3))); |
| |
|     // BC1 does not number its palette in order along the segment: selector 0 is maxEndpoint, |
|     // 1 is minEndpoint, 2 is (1/3)*minEndpoint + (2/3)*maxEndpoint and 3 is |
|     // (2/3)*minEndpoint + (1/3)*maxEndpoint. Going from slot to selector is therefore the lookup |
|     // table [1, 3, 2, 0], stored here 2 bits per entry (0b00101101 == 45). |
|     // |
|     // An arithmetic alternative is `index = -index & 3; return index ^ uint(index < 2);` but the |
|     // lookup table method is faster. |
|     const uint kSelectorTable = 45u; |
|     return bitfieldExtract(kSelectorTable, slot * 2, 2); |
| } |
| |
| // Alpha counterpart of getColorIndex: maps `alpha` to the 3-bit BC4 selector of the closest of |
| // the 8 values interpolated between `minAlpha` and `maxAlpha`. |
| // `minAlpha` and `maxAlpha` must differ, otherwise the division below is 0/0 (encodeAlpha() only |
| // calls this when they do). |
| uint getAlphaIndex(uint alpha, uint minAlpha, uint maxAlpha) { |
|     // Position of `alpha` between the two endpoints: 0 at minAlpha, 1 at maxAlpha. |
|     float t = float(alpha - minAlpha) / float(maxAlpha - minAlpha); |
| |
|     // Snap to the nearest of the 8 evenly spaced positions (0 = min ... 7 = max). |
|     int slot = int(round(clamp(t * 7, 0, 7))); |
| |
|     // As with BC1, BC4 selectors are not ordered along the range: 0 is maxAlpha, 1 is minAlpha, |
|     // 2 is (1/7)*minAlpha + (6/7)*maxAlpha, etc... Going from slot to selector is the lookup |
|     // table [1, 7, 6, 5, 4, 3, 2, 0], stored here one hex digit (4 bits) per entry, entry 0 in |
|     // the lowest digit. Only the low 3 bits of each entry are extracted. |
|     // |
|     // An arithmetic alternative is `index = -index & 7; return index ^ uint(index < 2);` but the |
|     // lookup table method is faster. |
|     const uint kSelectorTable = 0x02345671u; |
|     return bitfieldExtract(kSelectorTable, slot * 4, 3); |
| } |
| |
| // Chooses the two BC1 endpoint colors for a 4x4 block by fitting a line through the block's |
| // colors with Principal Component Analysis. See the comment at the top of this file for details |
| // and references on this algorithm. |
| // |
| // Every invocation of a 16-wide subgroup cluster calls this with the color of its own texel; the |
| // clustered subgroup operations below combine the 16 values of the block. |
| //   rgbColor: color of the texel handled by this invocation |
| //   minEndpoint / maxEndpoint: the block's texel colors found at the two extremes of the |
| //       principal axis (or the block's single color if all 16 texels are identical) |
| void computeEndpoints(uvec3 rgbColor, out uvec3 minEndpoint, out uvec3 maxEndpoint) { |
|     uvec3 colorSum = subgroupClusteredAdd(rgbColor, 16); |
|     uvec3 lowest = subgroupClusteredMin(rgbColor, 16); |
|     uvec3 highest = subgroupClusteredMax(rgbColor, 16); |
|     uvec3 mean = (colorSum + 8) >> 4;  // Divide by 16, +8 to round to nearest. |
| |
|     // Single-color block: nothing to fit, both endpoints are that color. |
|     if (lowest == highest) { |
|         minEndpoint = lowest; |
|         maxEndpoint = lowest; |
|         return; |
|     } |
| |
|     // Build the 3x3 symmetric covariance matrix of the r, g and b channels. Only 6 of its 9 |
|     // entries are unique, so they are accumulated as two vec3. |
|     ivec3 delta = ivec3(rgbColor) - ivec3(mean); |
|     vec3 covR = subgroupClusteredAdd(delta.r * delta, 16);           // rr, rg, rb |
|     vec3 covGB = subgroupClusteredAdd(delta.ggb * delta.gbb, 16);    // gg, gb, bb |
|     mat3 covariance = mat3(covR,                        // rr, rg, rb |
|                            vec3(covR.y, covGB.xy),      // rg, gg, gb |
|                            vec3(covR.z, covGB.yz));     // rb, gb, bb |
| |
|     // Approximate the principal axis by power iteration, starting from the diagonal of the |
|     // colors' bounding box. (https://en.wikipedia.org/wiki/Power_iteration) |
|     // 3 to 8 iterations are sufficient for a good approximation; 4 are used here. |
|     // Note: in theory the vector should be normalized on each iteration, however quality is |
|     // significantly higher (and performance obviously better) when not doing it. |
|     // TODO(gregschlom): Investigate why that is the case. |
|     vec3 axis = vec3(highest - lowest); |
|     for (int i = 0; i < 4; ++i) { |
|         axis = covariance * axis; |
|     } |
| |
|     // Scale the axis so that all its components are in the [-1,1] range, or fall back to the |
|     // RGB-to-luminance coefficients if its magnitude is too small. |
|     // TODO(gregschlom): Investigate if we really need this. It doesn't make a lot of sense. |
|     vec3 absAxis = abs(axis); |
|     float magn = max(max(absAxis.r, absAxis.g), absAxis.b); |
|     if (magn < 4.0) { |
|         axis = vec3(0.299f, 0.587f, 0.114f); |
|     } else { |
|         axis = axis / magn; |
|     } |
| |
|     // Project every color of the block on the axis and look for the two extremes. |
|     float projection = dot(vec3(rgbColor), axis); |
|     float lowestProjection = subgroupClusteredMin(projection, 16); |
|     float highestProjection = subgroupClusteredMax(projection, 16); |
| |
|     // Find out which invocations hold the extremes: each invocation contributes its own id if |
|     // it is an extreme and 0 otherwise, and the maximum is kept. |
|     uint idIfLowest = (projection == lowestProjection) ? gl_SubgroupInvocationID : 0u; |
|     uint idIfHighest = (projection == highestProjection) ? gl_SubgroupInvocationID : 0u; |
|     uvec2 extremeIds = subgroupClusteredMax(uvec2(idIfLowest, idIfHighest), 16); |
| |
|     // TODO(gregschlom): we're returning the original pixel colors instead of the projected colors. |
|     // Investigate if we could increase quality by returning the projected colors. |
|     minEndpoint = subgroupShuffle(rgbColor, extremeIds.x); |
|     maxEndpoint = subgroupShuffle(rgbColor, extremeIds.y); |
| } |
| |
| // Encodes the alpha values of a 4x4 block as a 64-bit BC4 block. |
| // |
| // Every invocation of a 16-wide subgroup cluster calls this with the alpha of its own texel. |
| //   value: alpha of the texel handled by this invocation |
| //   texelId: index of that texel within the 4x4 block; the shift amounts below assume [0, 15] |
| // Returns the block as (low 32 bits, high 32 bits). |
| uvec2 encodeAlpha(uint value, uint texelId) { |
|     uint lowest = subgroupClusteredMin(value, 16); |
|     uint highest = subgroupClusteredMax(value, 16); |
| |
|     // 3-bit selector of this texel. A block with a single alpha value uses selector 0 everywhere. |
|     uint selector = 0; |
|     if (lowest != highest) { |
|         selector = getAlphaIndex(value, lowest, highest); |
|     } |
| |
|     // Layout of the 64-bit block: bits [0, 16) hold the max and min values, and the selector of |
|     // texel i occupies the 3 bits starting at bit 16 + 3 * i (bit 16 for texel 0 up to bit 61 for |
|     // texel 15). Texels 0 to 4 land entirely in the low word, texels 6 to 15 entirely in the |
|     // high word, and texel 5 (bits 31 to 33) straddles both. |
|     // Note: shifting a uint by more than 31 is UB, which is why the words are guarded by a |
|     // ternary operator. |
|     uint lowWord = (texelId > 5) ? 0u : selector << (3 * texelId + 16); |
|     uint highWord = (texelId < 5) ? 0u : (selector << 29) >> (45 - 3 * texelId); |
| |
|     // Merge the selectors of the 16 texels of the block. |
|     uvec2 selectorBits = subgroupClusteredOr(uvec2(lowWord, highWord), 16); |
| |
|     // Max value goes in the first byte and min value in the second one. |
|     uint endpointBits = (highest & 0xff) | ((lowest & 0xff) << 8); |
|     return uvec2(endpointBits | selectorBits.x, selectorBits.y); |
| } |
| |
| // Quantizes an RGB color to RGB565, rounding each channel to the nearest representable value, |
| // and packs it with red in bits [11, 16), green in bits [5, 11) and blue in bits [0, 5). |
| // The 255.0 divisor means the input channels are treated as being in the [0, 255] range. |
| uint packColorToRGB565(uvec3 color) { |
|     vec3 scaled = vec3(color) * vec3(31.0, 63.0, 31.0) / vec3(255.0); |
|     uvec3 q = uvec3(round(scaled)); |
|     return q.b | (q.g << 5) | (q.r << 11); |
| } |
| |
| // Entry point. Each invocation decodes one ASTC texel; each group of 16 consecutive subgroup |
| // invocations cooperates to encode one 4x4 BC3 block, which the first invocation of the group |
| // writes to the destination image. |
| void main() { |
|     // gl_LocalInvocationID can't be used here: the spec makes no guarantee as to how it maps to |
|     // gl_SubgroupInvocationID (See: https://stackoverflow.com/q/72451338/), and because of the |
|     // subgroupClusteredXXX commands we need any 16 consecutive subgroup invocation ids |
|     // [16n, 16n+1..16n+15] to map to the same 4x4 block of the input image. |
|     // So we build our own local id from the subgroup ids instead. With a local group size of 64 |
|     // this is a number in the range [0, 63]. |
|     uint flatId = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; |
| |
|     uint blockId = flatId / 16;  // [0-3] Which 4x4 block of the workgroup we're working on |
|     uint texelId = flatId % 16;  // [0-15] Which texel of that 4x4 block |
| |
|     // Absolute coordinates in the input image. A workgroup covers an 8x8 tile made of 2x2 blocks: |
|     // bit 0 of blockId selects the column of the block and bit 1 selects its row. |
|     uvec2 tileOrigin = 8 * gl_WorkGroupID.xy; |
|     uvec2 blockOrigin = uvec2(4 * (blockId & 0x1), 2 * (blockId & 0x2)); |
|     uvec2 texelInBlock = uvec2(texelId % 4, texelId / 4); |
|     uvec2 texelCoord = tileOrigin + blockOrigin + texelInBlock; |
| |
|     // Layer, for array textures. |
|     uint layer = u_pushConstant.baseLayer + gl_WorkGroupID.z; |
| |
|     uvec4 texel = decodeRGBA(texelCoord, layer); |
| |
|     // Compute the color endpoints and quantize them to RGB565. |
|     uvec3 lowColor, highColor; |
|     computeEndpoints(texel.rgb, lowColor, highColor); |
|     uint packedLow = packColorToRGB565(lowColor); |
|     uint packedHigh = packColorToRGB565(highColor); |
| |
|     // Find which of the 4 palette colors best matches the current texel. If both endpoints |
|     // quantize to the same value there is nothing to choose from and selector 0 is used. |
|     uint selector = 0; |
|     if (packedLow != packedHigh) { |
|         selector = getColorIndex(vec3(texel.rgb), vec3(lowColor), vec3(highColor)); |
|     } |
| |
|     // The larger packed value is stored as the first endpoint. getColorIndex assumed that this |
|     // would be the max endpoint; if it is the min endpoint instead, the roles are swapped, which |
|     // is compensated for by flipping the low bit of the selector (0 <-> 1 and 2 <-> 3). |
|     uint color0 = max(packedLow, packedHigh); |
|     uint color1 = min(packedLow, packedHigh); |
|     if (packedLow > packedHigh) { |
|         selector ^= 1u; |
|     } |
| |
|     // Pack everything together: alpha block first, then the two color endpoints, then the sixteen |
|     // 2-bit color selectors. |
|     uvec4 bc3Block; |
|     bc3Block.rg = encodeAlpha(texel.a, texelId); |
|     bc3Block.b = color0 | (color1 << 16); |
|     bc3Block.a = subgroupClusteredOr(selector << (2 * texelId), 16); |
| |
|     // A single invocation per 4x4 block writes the result. |
|     if (texelId == 0) { |
|         imageStore(dstImage, WITH_TYPE(getPos)(ivec3(texelCoord / 4, layer)), bc3Block); |
|     } |
| } |