0% found this document useful (0 votes)
54 views

Bad P 3

This document contains a compute shader that defines macros and functions for decoding DXT-compressed texture blocks into rgba8 values and writing them to an output image. Work is organized in groups of 64 threads along X, with 16 threads assigned to each 4x4 block. The first thread of each block reads the compressed block into shared memory; every thread then computes the block's four candidate colors and writes a single texel to the output based on its index within the block.

Uploaded by

Zaki Erlangga
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
54 views

Bad P 3

This document contains a compute shader that defines macros and functions for decoding DXT-compressed texture blocks into rgba8 values and writing them to an output image. Work is organized in groups of 64 threads along X, with 16 threads assigned to each 4x4 block. The first thread of each block reads the compressed block into shared memory; every thread then computes the block's four candidate colors and writes a single texel to the output based on its index within the block.

Uploaded by

Zaki Erlangga
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 6

#version 320 es

// Expands to the early_fragment_tests input layout qualifier.
#define FORCE_EARLY_Z layout(early_fragment_tests) in

// Location helpers: in this configuration they expand to nothing.
#define ATTRIBUTE_LOCATION(x)
#define FRAGMENT_OUTPUT_LOCATION(x)
#define FRAGMENT_OUTPUT_LOCATION_INDEXED(x, y)
// Binding helpers: each expands to a layout qualifier carrying an explicit binding index
// (plus the packing or image format where one is passed).
#define UBO_BINDING(packing, x) layout(packing, binding = x)
#define SAMPLER_BINDING(x) layout(binding = x)
#define TEXEL_BUFFER_BINDING(x) layout(binding = x)
#define SSBO_BINDING(x) layout(binding = x)
#define IMAGE_BINDING(format, x) layout(format, binding = x)

// Expands to nothing in this configuration.
#define VARYING_LOCATION(x)

// Extensions requested with `enable`.
#extension GL_ANDROID_extension_pack_es31a : enable

#extension GL_ARM_shader_framebuffer_fetch: enable


// Framebuffer-fetch helpers: FB_FETCH_VALUE names gl_LastFragColorARM and
// FRAGMENT_INOUT expands to the `out` qualifier.
#define FB_FETCH_VALUE gl_LastFragColorARM
#define FRAGMENT_INOUT out

// Default precision is highp for floats, ints and the opaque types listed below.
precision highp float;


precision highp int;
precision highp sampler2DArray;
precision highp usamplerBuffer;
precision highp sampler2DMS;
precision highp image2DArray;
// API selection flag. API_D3D is not defined anywhere in this file, so the GLSL side of
// each `#ifdef API_D3D` conditional below is the one that gets compiled.
#define API_OPENGL 1
// HLSL-style type and function names mapped onto their GLSL equivalents.
#define float2 vec2
#define float3 vec3
#define float4 vec4
#define uint2 uvec2
#define uint3 uvec3
#define uint4 uvec4
#define int2 ivec2
#define int3 ivec3
#define int4 ivec4
#define frac fract
#define lerp mix
// Palette format selection; GetPaletteColor() tests for this macro to choose its decode path.
#define PALETTE_FORMAT_IA8 1

// A palette is in use whenever any one of the palette formats has been selected.
// The whole condition must sit on a single logical line: a preprocessor expression that
// ends in `||` with its last operand on a later line does not compile.
#if defined(PALETTE_FORMAT_IA8) || defined(PALETTE_FORMAT_RGB565) || defined(PALETTE_FORMAT_RGB5A3)
#define HAS_PALETTE 1
#endif

// Uniform block holding the decoder's parameters. It is declared as a cbuffer at register b0
// when API_D3D is defined, otherwise as a std140 uniform block at binding 1.
// Member order determines the buffer layout, so it must not be rearranged.
#ifdef API_D3D
cbuffer UBO : register(b0) {
#else
UBO_BINDING(std140, 1) uniform UBO {
#endif
// Not read by any code visible in this file.
uint2 u_dst_size;
// Source size; main() divides .x by BLOCK_SIZE_X to get the number of DXT blocks per row.
uint2 u_src_size;
// Base position added to every index computed into s_input_buffer.
uint u_src_offset;
// Multiplied by a block/tile row index to step buffer positions from one row to the next.
uint u_src_row_stride;
// Added to the palette index before fetching from s_palette_buffer.
uint u_palette_offset;
};

#ifdef API_D3D

// D3D resource declarations (compiled only when API_D3D is defined).
// Buffer the decoder fetches its source data from, at register t0.
Buffer<uint4> s_input_buffer : register(t0);


// Palette buffer at register t1; only declared when a palette format is selected.
#ifdef HAS_PALETTE
Buffer<uint4> s_palette_buffer : register(t1);
#endif

// Texture array that decoded texels are stored to, at register u0.
RWTexture2DArray<unorm float4> output_image : register(u0);

// Helpers for reading/writing.
// These let the shared decoder code use GLSL spellings when compiled as HLSL:
// texelFetch() becomes a buffer Load, imageStore() becomes an indexed assignment,
// GROUP_MEMORY_BARRIER_WITH_SYNC becomes GroupMemoryBarrierWithGroupSync() (the expansion
// already ends in ';'), and GROUP_SHARED becomes the groupshared storage class.


#define texelFetch(buffer, pos) buffer.Load(pos)
#define imageStore(image, coords, value) image[coords] = value
#define GROUP_MEMORY_BARRIER_WITH_SYNC GroupMemoryBarrierWithGroupSync();
#define GROUP_SHARED groupshared

// Declares the compute entry point for the D3D path with an (lx, ly, 1) thread group.
// The HLSL system values are exposed under the GLSL built-in names so the decoder body can be
// shared between both APIs. Every line except the last must end in a backslash with no blank
// line in between, otherwise the macro ends early and the remaining lines become stray code.
#define DEFINE_MAIN(lx, ly) \
  [numthreads(lx, ly, 1)] \
  void main(uint3 gl_WorkGroupID : SV_GroupId, \
            uint3 gl_LocalInvocationID : SV_GroupThreadID, \
            uint3 gl_GlobalInvocationID : SV_DispatchThreadID)

// Fallback for the GLSL bitfieldExtract() built-in (unsigned form) on the D3D path.
//   val  - value to extract bits from
//   off  - index of the least significant bit of the field
//   size - number of bits in the field
// Returns the `size` bits of `val` starting at bit `off`, moved down to bit 0.
uint bitfieldExtract(uint val, int off, int size)
{
  // This built-in function is only supported in OpenGL 4.0+ and ES 3.1+.
  // Microsoft's HLSL compiler automatically optimises this to a bitfield extract instruction.
  // A full-width field needs an all-ones mask, which (1 << 32) - 1 cannot produce, so that case
  // is handled explicitly.
  uint mask = (size >= 32) ? 0xFFFFFFFFu : ((1u << size) - 1u);
  return (val >> off) & mask;
}

#else

// GLSL resource declarations (compiled when API_D3D is not defined).
// Unsigned-integer texel buffer the decoder fetches its source data from, at binding 0.
TEXEL_BUFFER_BINDING(0) uniform usamplerBuffer s_input_buffer;


// Palette texel buffer at binding 1; only declared when a palette format is selected.
#ifdef HAS_PALETTE
TEXEL_BUFFER_BINDING(1) uniform usamplerBuffer s_palette_buffer;
#endif
// Write-only rgba8 image array that decoded texels are stored to, at image binding 0.
IMAGE_BINDING(rgba8, 0) uniform writeonly image2DArray output_image;

// Issues memoryBarrierShared() followed by barrier(). The expansion already ends in ';',
// so a use written as `GROUP_MEMORY_BARRIER_WITH_SYNC;` just adds an empty statement.
#define GROUP_MEMORY_BARRIER_WITH_SYNC memoryBarrierShared(); barrier();


// Storage qualifier for variables shared across the invocations of a work group.
#define GROUP_SHARED shared

// Declares the compute entry point for the GLSL path with a local work group size of lx by ly.
// Every line except the last must end in a backslash with no blank line in between, otherwise
// the macro expands to nothing and the layout declaration is left referring to the undefined
// names lx and ly.
#define DEFINE_MAIN(lx, ly) \
  layout(local_size_x = lx, local_size_y = ly) in; \
  void main()

#endif

// Exchanges the two bytes of a 16-bit value (BE to LE) and keeps only the low 16 bits.
uint Swap16(uint v)
{
  uint upper_to_lower = (v >> 8) & 0xFFFFu;
  uint lower_to_upper = (v << 8) & 0xFFFFu;
  return upper_to_lower | lower_to_upper;
}

// Expands a 3-bit channel to 8 bits by repeating its bit pattern: 00000123 -> 12312312.
uint Convert3To8(uint v)
{
  uint top = v << 5;
  uint middle = v << 2;
  uint bottom = v >> 1;
  return top | middle | bottom;
}
// Expands a 4-bit channel to 8 bits by duplicating the nibble: 00001234 -> 12341234.
uint Convert4To8(uint v)
{
  uint high_nibble = v << 4;
  return high_nibble | v;
}
// Expands a 5-bit channel to 8 bits, refilling the low bits from the top: 00012345 -> 12345123.
uint Convert5To8(uint v)
{
  uint shifted_up = v << 3;
  uint replicated_top = v >> 2;
  return shifted_up | replicated_top;
}
// Expands a 6-bit channel to 8 bits, refilling the low bits from the top: 00123456 -> 12345612.
uint Convert6To8(uint v)
{
  uint shifted_up = v << 2;
  uint replicated_top = v >> 4;
  return shifted_up | replicated_top;
}

// Maps texel coordinates to a position in the tiled source buffer.
//   block_size - width and height of one tile, in texels
//   coords     - texel coordinates to look up
// The result is u_src_offset, plus one u_src_row_stride per tile row, plus one full tile per
// tile column, plus the row-major offset of the texel inside its tile.
uint GetTiledTexelOffset(uint2 block_size, uint2 coords)
{
  uint2 tile = coords / block_size;
  uint2 within_tile = coords % block_size;
  uint texels_per_tile = block_size.x * block_size.y;
  return u_src_offset + tile.y * u_src_row_stride + tile.x * texels_per_tile +
         within_tile.y * block_size.x + within_tile.x;
}

// Looks up palette entry `index` (relative to u_palette_offset) and decodes it to four
// 8-bit channels, returned un-normalized as (r, g, b, a). The decode path is chosen at
// compile time by whichever PALETTE_FORMAT_* macro is defined.
uint4 GetPaletteColor(uint index)
{
  // Fetch the 16-bit entry and swap it from BE to LE.
  int palette_pos = int(u_palette_offset + index);
  uint entry = Swap16(texelFetch(s_palette_buffer, palette_pos).x);

#if defined(PALETTE_FORMAT_IA8)
  // Low byte is the intensity (replicated to r, g and b), high byte is the alpha.
  uint intensity = bitfieldExtract(entry, 0, 8);
  uint alpha = bitfieldExtract(entry, 8, 8);
  return uint4(intensity, intensity, intensity, alpha);
#elif defined(PALETTE_FORMAT_RGB565)
  // 5 bits red, 6 bits green, 5 bits blue; always opaque.
  return uint4(Convert5To8(bitfieldExtract(entry, 11, 5)),
               Convert6To8(bitfieldExtract(entry, 5, 6)),
               Convert5To8(bitfieldExtract(entry, 0, 5)),
               255u);
#elif defined(PALETTE_FORMAT_RGB5A3)
  if ((entry & 0x8000u) == 0u)
  {
    // Top bit clear: 3 bits of alpha followed by 4 bits per color channel.
    return uint4(Convert4To8(bitfieldExtract(entry, 8, 4)),
                 Convert4To8(bitfieldExtract(entry, 4, 4)),
                 Convert4To8(bitfieldExtract(entry, 0, 4)),
                 Convert3To8(bitfieldExtract(entry, 12, 3)));
  }

  // Top bit set: opaque, 5 bits per color channel.
  return uint4(Convert5To8(bitfieldExtract(entry, 10, 5)),
               Convert5To8(bitfieldExtract(entry, 5, 5)),
               Convert5To8(bitfieldExtract(entry, 0, 5)),
               255u);
#else
  // Not used.
  return uint4(0, 0, 0, 0);
#endif
}

// Returns palette entry `index` with each 8-bit channel divided by 255.0.
float4 GetPaletteColorNormalized(uint index)
{
  return float4(GetPaletteColor(index)) / 255.0;
}

// In the compute version of this decoder, we flatten the blocks to a one-dimension array.
// Each group of GROUP_SIZE threads is subdivided into sub-groups of BLOCK_SIZE (16) threads,
// one sub-group per DXT block, and the first thread in each sub-group fetches the DXT data.
// All threads then calculate the possible colors for the block and write to the output image.

// Threads per work group along X.
#define GROUP_SIZE 64u

// Dimensions of one DXT block in texels, and the resulting texels (threads) per block.
#define BLOCK_SIZE_X 4u
#define BLOCK_SIZE_Y 4u
#define BLOCK_SIZE (BLOCK_SIZE_X * BLOCK_SIZE_Y)
// Number of DXT blocks decoded by one work group.
#define BLOCKS_PER_GROUP (GROUP_SIZE / BLOCK_SIZE)

// Blends two channel values with weights 3/8 (v1) and 5/8 (v2), rounding down.
// The 3/8 weight is used as a close stand-in for 1/3.
uint DXTBlend(uint v1, uint v2)
{
  uint weighted_sum = 3u * v1 + 5u * v2;
  return weighted_sum / 8u;
}

// Group-shared staging area with one uint2 slot per DXT block handled by the work group.
// main() stores each block's raw data here from one thread and reads it back from all of them.
GROUP_SHARED uint2 shared_temp[BLOCKS_PER_GROUP];

// Compute entry point. Each invocation decodes one texel of one 4x4 DXT block:
//   1. the first thread of every block fetches the raw block into shared_temp,
//   2. after a barrier, every thread unpacks the two endpoint colors and derives the four
//      candidate colors of its block,
//   3. every thread selects its texel's 2-bit index and stores the normalized color.
//
// The work group is GROUP_SIZE x 1. Only the X components of gl_LocalInvocationID and
// gl_WorkGroupID are read below, so a Y extent above 1 would only make extra invocations repeat
// identical work while multiplying the invocations per group (64 x 8 = 512), which can exceed
// the implementation's limit on total work group invocations.
DEFINE_MAIN(GROUP_SIZE, 1)
{
  uint local_thread_id = gl_LocalInvocationID.x;
  uint block_in_group = local_thread_id / BLOCK_SIZE;
  uint thread_in_block = local_thread_id % BLOCK_SIZE;
  uint block_index = gl_WorkGroupID.x * BLOCKS_PER_GROUP + block_in_group;

  // Annoyingly, we can't precalculate this as a uniform because the DXT block size differs
  // from the block size of the overall texture (4 vs 8). We can however use a multiply and
  // subtraction to avoid the modulo for calculating the block's X coordinate.
  uint blocks_wide = u_src_size.x / BLOCK_SIZE_X;
  uint2 block_coords;
  block_coords.y = block_index / blocks_wide;
  block_coords.x = block_index - (block_coords.y * blocks_wide);

  // Only the first thread for each block reads from the texel buffer.
  if (thread_in_block == 0u)
  {
    // Calculate tiled block coordinates: blocks are stored in tiles of 2x2 blocks, so a tile
    // spans four consecutive buffer entries and a block row within the tile spans two.
    uint2 tile_block_coords = block_coords / 2u;
    uint2 subtile_block_coords = block_coords % 2u;
    uint buffer_pos = u_src_offset;
    buffer_pos += tile_block_coords.y * u_src_row_stride;
    buffer_pos += tile_block_coords.x * 4u;
    buffer_pos += subtile_block_coords.y * 2u;
    buffer_pos += subtile_block_coords.x;

    // Read the entire DXT block to shared memory.
    uint2 raw_data = texelFetch(s_input_buffer, int(buffer_pos)).xy;
    shared_temp[block_in_group] = raw_data;
  }

  // Ensure store is completed before the remaining threads in the block continue.
  GROUP_MEMORY_BARRIER_WITH_SYNC;

  // Unpack colors and swap BE to LE. Both 16-bit endpoint colors are byte-swapped at once by
  // exchanging the bytes inside each half of the 32-bit word.
  uint2 raw_data = shared_temp[block_in_group];
  uint swapped = ((raw_data.x & 0xFF00FF00u) >> 8) | ((raw_data.x & 0x00FF00FFu) << 8);
  uint c1 = swapped & 0xFFFFu;
  uint c2 = swapped >> 16;

  // Expand 5/6 bit channels to 8-bits per channel.
  uint blue1 = Convert5To8(bitfieldExtract(c1, 0, 5));
  uint blue2 = Convert5To8(bitfieldExtract(c2, 0, 5));
  uint green1 = Convert6To8(bitfieldExtract(c1, 5, 6));
  uint green2 = Convert6To8(bitfieldExtract(c2, 5, 6));
  uint red1 = Convert5To8(bitfieldExtract(c1, 11, 5));
  uint red2 = Convert5To8(bitfieldExtract(c2, 11, 5));

  // Determine the four colors the block can use.
  // It's quicker to just precalculate all four colors rather than branching on the index.
  // NOTE: These must be masked with 0xFF. This is done at the normalization stage below.
  uint4 color0, color1, color2, color3;
  color0 = uint4(red1, green1, blue1, 255u);
  color1 = uint4(red2, green2, blue2, 255u);
  if (c1 > c2)
  {
    // Two opaque intermediate colors, each weighted towards one endpoint.
    color2 = uint4(DXTBlend(red2, red1), DXTBlend(green2, green1), DXTBlend(blue2, blue1), 255u);
    color3 = uint4(DXTBlend(red1, red2), DXTBlend(green1, green2), DXTBlend(blue1, blue2), 255u);
  }
  else
  {
    // One midpoint color, plus the same color with zero alpha.
    color2 = uint4((red1 + red2) / 2u, (green1 + green2) / 2u, (blue1 + blue2) / 2u, 255u);
    color3 = uint4((red1 + red2) / 2u, (green1 + green2) / 2u, (blue1 + blue2) / 2u, 0u);
  }

  // Calculate the texel coordinates that we will write to.
  // The divides/modulo here should be turned into a shift/binary AND.
  uint local_y = thread_in_block / BLOCK_SIZE_X;
  uint local_x = thread_in_block % BLOCK_SIZE_X;
  uint global_x = block_coords.x * BLOCK_SIZE_X + local_x;
  uint global_y = block_coords.y * BLOCK_SIZE_Y + local_y;

  // Use the coordinates within the block to shift the 32-bit value containing
  // all 16 indices to a single 2-bit index. Each row of the block occupies one byte, and
  // the leftmost texel of a row sits in the top two bits of that byte.
  uint index = bitfieldExtract(raw_data.y, int((local_y * 8u) + (6u - local_x * 2u)), 2);

  // Select the un-normalized color from the precalculated color array.
  // Using a switch statement here removes the need for dynamic indexing of an array.
  uint4 color;
  switch (index)
  {
  case 0u: color = color0; break;
  case 1u: color = color1; break;
  case 2u: color = color2; break;
  case 3u: color = color3; break;
  default: color = color0; break;
  }

  // Normalize and write to the output image (layer 0).
  float4 norm_color = float4(color & 0xFFu) / 255.0;
  imageStore(output_image, int3(int2(uint2(global_x, global_y)), 0), norm_color);
}

// Shader compile/link log: Max number of total work group invocations exceeded.


#

You might also like